diff --git a/.bazelrc b/.bazelrc
index d7ae76f096431a..02dec0349c4741 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -51,16 +51,13 @@
 # Remote build execution options (only configured to work with TF team projects for now.)
 # rbe_base: General RBE options shared by all flavors.
 # rbe_linux: General RBE options used on all linux builds.
-# rbe_win: General RBE options used on all windows builds.
+# rbe_win_base: General RBE options used on all Windows builds. Not to be used standalone.
+# rbe_win_clang: Options specific to compiling using Clang.
 #
 # rbe_linux_cpu: RBE options to build with only CPU support.
 # rbe_linux_cuda: RBE options to build with GPU support using clang.
 # rbe_linux_cuda_nvcc: RBE options to build with GPU support using nvcc.
 #
-# rbe_win_py39: Windows Python 3.9 RBE config
-#
-# tensorflow_testing_rbe_win: RBE options to use RBE with tensorflow-testing project on windows
-#
 # Embedded Linux options (experimental and only tested with TFLite build yet)
 # elinux: General Embedded Linux options shared by all flavors.
 # elinux_aarch64: Embedded Linux options for aarch64 (ARM64) CPU support.
@@ -450,6 +447,17 @@ build:win_clang --host_linkopt=/FORCE:MULTIPLE
 test:win_clang --linkopt=/FORCE:MULTIPLE
 test:win_clang --host_linkopt=/FORCE:MULTIPLE
 
+# Same config as above but for XLA, which has different toolchain paths
+build:win_clang_xla --copt=/clang:-Weverything
+build:win_clang_xla --extra_toolchains=@local_config_cc//:cc-toolchain-x64_windows-clang-cl
+build:win_clang_xla --extra_execution_platforms=//tools/toolchains/win:x64_windows-clang-cl
+build:win_clang_xla --host_platform=//tools/toolchains/win:x64_windows-clang-cl
+build:win_clang_xla --compiler=clang-cl
+build:win_clang_xla --linkopt=/FORCE:MULTIPLE
+build:win_clang_xla --host_linkopt=/FORCE:MULTIPLE
+test:win_clang_xla --linkopt=/FORCE:MULTIPLE
+test:win_clang_xla --host_linkopt=/FORCE:MULTIPLE
+
 # Options to build TensorFlow 1.x or 2.x.
 # TODO(kanglan): Change v2's define to default behavior
 build:v2 --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1
@@ -546,38 +554,25 @@ build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
 build:rbe_linux_cuda_nvcc --config=nvcc_clang
 build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1
 
-# TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed
-build:rbe_win --config=rbe_base
-build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_05022023:toolchain"
-build:rbe_win --extra_toolchains="//tensorflow/tools/toolchains/win/tf_win_05022023:cc-toolchain-x64_windows"
-build:rbe_win --extra_execution_platforms="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019"
-build:rbe_win --host_platform="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019"
-build:rbe_win --platforms="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019"
-build:rbe_win --shell_executable=C:\\tools\\msys64\\usr\\bin\\bash.exe
-build:rbe_win --experimental_strict_action_env=true
-
-# TODO(gunan): Remove once we use MSVC 2019 with latest patches.
-build:rbe_win --define=override_eigen_strong_inline=true
-
+build:rbe_win_base --config=rbe_base
+build:rbe_win_base --shell_executable=C:\\tools\\msys64\\usr\\bin\\bash.exe
+build:rbe_win_base --remote_instance_name=projects/tensorflow-testing/instances/windows
 # Don't build the python zip archive in the RBE build.
-build:rbe_win --remote_download_minimal
-build:rbe_win --enable_runfiles
-build:rbe_win --nobuild_python_zip
-
-build:rbe_win_py38 --config=rbe_base
-build:rbe_win_py38 --repo_env=PYTHON_BIN_PATH=C:\\Python38\\python.exe
-build:rbe_win_py38 --repo_env=PYTHON_LIB_PATH=C:\\Python38\\lib\\site-packages
-build:rbe_win_py38 --repo_env=TF_PYTHON_CONFIG_REPO=//tensorflow/tools/toolchains/win_1803/py38
-build:rbe_win_py38 --python_path=C:\\Python38\\python.exe
-
-build:rbe_win_py39 --config=rbe_base
-build:rbe_win_py39 --repo_env=PYTHON_BIN_PATH=C:\\Python39\\python.exe
-build:rbe_win_py39 --repo_env=PYTHON_LIB_PATH=C:\\Python39\\lib\\site-packages
-build:rbe_win_py39 --repo_env=TF_PYTHON_CONFIG_REPO=//tensorflow/tools/toolchains/win_1803/py39
-build:rbe_win_py39 --python_path=C:\\Python39\\python.exe
-
-# TODO(kanglan): Merge tensorflow_testing_rbe_win into rbe_win
-common:tensorflow_testing_rbe_win --remote_instance_name=projects/tensorflow-testing/instances/windows
+build:rbe_win_base --remote_download_minimal
+build:rbe_win_base --enable_runfiles
+build:rbe_win_base --nobuild_python_zip
+build:rbe_win_base --define=override_eigen_strong_inline=true
+
+build:rbe_win_clang --config=rbe_win_base
+build:rbe_win_clang --crosstool_top="//tensorflow/tools/toolchains/win/20240424:toolchain"
+build:rbe_win_clang --extra_toolchains="//tensorflow/tools/toolchains/win/20240424:cc-toolchain-x64_windows-clang-cl"
+build:rbe_win_clang --extra_execution_platforms="//tensorflow/tools/toolchains/win:x64_windows-clang-cl"
+build:rbe_win_clang --host_platform="//tensorflow/tools/toolchains/win:x64_windows-clang-cl"
+build:rbe_win_clang --platforms="//tensorflow/tools/toolchains/win:x64_windows-clang-cl"
+build:rbe_win_clang --compiler=clang-cl
+build:rbe_win_clang --linkopt=/FORCE:MULTIPLE
+build:rbe_win_clang --host_linkopt=/FORCE:MULTIPLE
+
 # END TF REMOTE BUILD EXECUTION OPTIONS
 
 # TFLite build configs for generic embedded Linux
@@ -815,7 +810,7 @@ test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflo
 build:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 build:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
 build:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3
-# TODO(michaelhudgins): Why do we need to specifically omit go and java here? 
+# TODO(michaelhudgins): Why do we need to specifically omit go and java here?
 build:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test -//tensorflow/python/tools:aot_compiled_test
 
 # CROSS-COMPILE ARM64 PYCPP
 build:cross_compile_linux_arm64_pycpp_test --config=linux_arm64_pycpp_test
@@ -924,7 +919,9 @@ build:cross_compile_macos_x86 --extra_toolchains=//tensorflow/tools/toolchains/c
 build:cross_compile_macos_x86 --platform_mappings=tensorflow/tools/toolchains/cross_compile/config/platform_mappings
 
 # RBE cross-compile configs for Darwin x86
-build:rbe_cross_compile_macos_x86 --config=cross_compile_macos_x86
+build:rbe_cross_compile_macos_x86 --config=cross_compile_macos_x86 --remote_download_minimal
+build:rbe_cross_compile_macos_x86 --bes_backend="" --bes_results_url="" --bes_timeout="0s"
+build:rbe_cross_compile_macos_x86 --experimental_remote_build_event_upload="minimal"
 build:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base
 build:rbe_cross_compile_macos_x86 --bes_upload_mode=nowait_for_upload_complete
 test:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base
diff --git a/RELEASE.md b/RELEASE.md
index 3c6198b60d1918..8287988507e571 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -31,6 +31,7 @@
   been added to TF binary distributions (Python wheels).
 * Replace `DebuggerOptions` of TensorFlow Quantizer, and migrate to
   `DebuggerConfig` of StableHLO Quantizer.
+* Add TensorFlow to StableHLO converter to TensorFlow pip package.
 
 ## Keras
 
@@ -87,6 +88,8 @@
   * The Python TF Lite Interpreter bindings now have an option
     `experimental_default_delegate_latest_features` to enable all default
     delegate features.
+  * Flatbuffer version update:
+    * `GetTemporaryPointer()` bug fixed.
 
 * `tf.data`
   * Add `wait` to `tf.data.Dataset.load`. If `True`, for snapshots written
@@ -95,6 +98,13 @@
     it's finished. The default is `False` for backward compatibility. Users of
     `distributed_save` are recommended to set it to `True`.
 
+* `tf.tpu.experimental.embedding.TPUEmbeddingV2`
+  * Add `compute_sparse_core_stats` for sparse core users to profile the
+    data with this API to get the `max_ids` and `max_unique_ids`. These
+    numbers will be needed to configure the sparse core embedding mid level
+    api.
+  * Remove the `preprocess_features` method since that's no longer needed.
+
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
diff --git a/ci/official/containers/linux_arm64/build.sh b/ci/official/containers/linux_arm64/build.sh
index 5d6a40658bd782..611d5f48ac0084 100755
--- a/ci/official/containers/linux_arm64/build.sh
+++ b/ci/official/containers/linux_arm64/build.sh
@@ -40,11 +40,15 @@ else
   fi
 fi
 
+# TODO(b/341050361): When these steps are verified, remove the GCR image code.
+AR_IMAGE_PATH="us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/build-arm64"
+
 # Build for both JAX and TF usage. We do these in one place because they share
 # almost all of the same cache layers
 export DOCKER_BUILDKIT=1
 for target in jax tf; do
   IMAGE="gcr.io/tensorflow-sigs/build-arm64:$target-$TAG"
+  AR_IMAGE="$AR_IMAGE_PATH:$target-$TAG"
   docker pull "$IMAGE" || true
   # Due to some flakiness of resources pulled in the build, allow the docker
   # command to reattempt build a few times in the case of failure (b/302558736)
@@ -55,7 +59,7 @@ for target in jax tf; do
       --build-arg REQUIREMENTS_FILE=jax.requirements.txt \
       --target=$target \
       --cache-from "$IMAGE" \
-      -t "$IMAGE" . && break
+      -t "$IMAGE" -t "$AR_IMAGE" . && break
   done
   final=$?
   if [ $final -ne 0 ]; then
@@ -66,5 +70,7 @@ for target in jax tf; do
   if [[ -n "$KOKORO_BUILD_ID" ]]; then
     gcloud auth configure-docker
     docker push "$IMAGE"
+    gcloud auth configure-docker us-central1-docker.pkg.dev
+    docker push "$AR_IMAGE"
   fi
 done
diff --git a/ci/official/utilities/setup_docker.sh b/ci/official/utilities/setup_docker.sh
index 36afa2545eb244..91618c75f3ba51 100755
--- a/ci/official/utilities/setup_docker.sh
+++ b/ci/official/utilities/setup_docker.sh
@@ -14,11 +14,12 @@
 # limitations under the License.
 # ==============================================================================
 if [[ "$TFCI_DOCKER_PULL_ENABLE" == 1 ]]; then
-  # Simple retry logic for docker-pull errors. Sleeps for 15s if a pull fails.
+  # Simple retry logic for docker-pull errors. Sleeps if a pull fails.
   # Pulling an already-pulled container image will finish instantly, so
   # repeating the command costs nothing.
   docker pull "$TFCI_DOCKER_IMAGE" || sleep 15
-  docker pull "$TFCI_DOCKER_IMAGE" || sleep 15
+  docker pull "$TFCI_DOCKER_IMAGE" || sleep 30
+  docker pull "$TFCI_DOCKER_IMAGE" || sleep 60
   docker pull "$TFCI_DOCKER_IMAGE"
 fi
diff --git a/requirements_lock_3_10.txt b/requirements_lock_3_10.txt
index 05dc3940487eef..f17468ddaafd0a 100644
--- a/requirements_lock_3_10.txt
+++ b/requirements_lock_3_10.txt
@@ -522,9 +522,9 @@ urllib3==2.2.0 \
     --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \
     --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224
     # via requests
-werkzeug==3.0.1 \
-    --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \
-    --hash=sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10
+werkzeug==3.0.3 \
+    --hash=sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18 \
+    --hash=sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/requirements_lock_3_11.txt b/requirements_lock_3_11.txt
index 05dc3940487eef..f17468ddaafd0a 100644
--- a/requirements_lock_3_11.txt
+++ b/requirements_lock_3_11.txt
@@ -522,9 +522,9 @@ urllib3==2.2.0 \
     --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \
     --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224
     # via requests
-werkzeug==3.0.1 \
-    --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \
-    --hash=sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10
+werkzeug==3.0.3 \
+    --hash=sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18 \
+    --hash=sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/requirements_lock_3_12.txt b/requirements_lock_3_12.txt
index 120ec6ebcd7c72..0d045ea1a0579c 100644
--- a/requirements_lock_3_12.txt
+++ b/requirements_lock_3_12.txt
@@ -530,9 +530,9 @@ urllib3==2.2.0 \
     --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \
     --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224
     # via requests
-werkzeug==3.0.1 \
-    --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \
-    --hash=sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10
+werkzeug==3.0.3 \
+    --hash=sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18 \
+    --hash=sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/requirements_lock_3_9.txt b/requirements_lock_3_9.txt
index 36a55514cd788b..48c74173fe553f 100644
--- a/requirements_lock_3_9.txt
+++ b/requirements_lock_3_9.txt
@@ -526,9 +526,9 @@ urllib3==2.2.0 \
     --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \
     --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224
     # via requests
-werkzeug==3.0.1 \
-    --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \
-    --hash=sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10
+werkzeug==3.0.3 \
+    --hash=sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18 \
+    --hash=sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 71487e2aec0bee..a4cd4af8975bc2 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -1382,6 +1382,7 @@ tf_cc_shared_library(
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
         "//tensorflow/compiler/mlir/lite/sparsity:sparsify_model",
         "//tensorflow/compiler/mlir/quantization/stablehlo/python:pywrap_quantization_lib_impl",
+        "//tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python:pywrap_tensorflow_to_stablehlo_lib_impl",
         "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:custom_aggregator_op",
         "//tensorflow/compiler/mlir/quantization/tensorflow/python:quantize_model_cc_impl",
         "//tensorflow/compiler/mlir/quantization/tensorflow:passes",
@@ -1416,6 +1417,7 @@ tf_cc_shared_library(
         "//tensorflow/core/grappler:grappler_item_builder",
         "//tensorflow/core/kernels:data_service_ops",
         "//tensorflow/core/kernels:dataset_ops",
+        "//tensorflow/core/tpu/kernels:sparse_core_layout",
         "//tensorflow/core/platform:logging",
        "//tensorflow/core/platform:path",
        "//tensorflow/core/platform:stacktrace_handler",
diff --git a/tensorflow/c/c_test.c b/tensorflow/c/c_test.c
index ce8a115c5b21bd..5415b2deaf6c93 100644
--- a/tensorflow/c/c_test.c
+++ b/tensorflow/c/c_test.c
@@ -20,6 +20,10 @@ limitations under the License.
 #include
 #include
 
+#ifdef _WIN32
+#include
+#endif
+
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_experimental.h"
 #include "tensorflow/c/env.h"
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index a433b618de7142..d20be8abcf02a4 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -296,8 +296,8 @@ TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status) {
 
 void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
   if (h == nullptr) return;
-  tensorflow::profiler::TraceMe activity(
-      "TFE_DeleteTensorHandle", tensorflow::profiler::TraceMeLevel::kInfo);
+  tsl::profiler::TraceMe activity("TFE_DeleteTensorHandle",
+                                  tsl::profiler::TraceMeLevel::kInfo);
   if (h) {
     tensorflow::unwrap(h)->Unref();
   }
diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc
index 8422459c21b529..ab29b1cd6ff051 100644
--- a/tensorflow/c/eager/c_api_unified_experimental.cc
+++ b/tensorflow/c/eager/c_api_unified_experimental.cc
@@ -216,7 +216,7 @@ void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name,
   Status status =
       unwrap(op)->SetAttrType(attr_name, static_cast<DataType>(value));
   TF_SetStatus(s, static_cast<TF_Code>(status.code()),
-               tsl::NullTerminatedMessage(status));
+               absl::StatusMessageAsCStr(status));
 }
 
 void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs,
diff --git a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
index 159e36e485e6a6..8cb30fa9ae0828 100644
--- a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
@@ -31,8 +31,5 @@ cc_library(
         "nobuilder",
         "notap",
     ],
-    deps = [
-        "//tensorflow/c:tf_status",
-        "//tensorflow/c/experimental/filesystem:filesystem_interface",
-    ],
+    deps = ["//tensorflow/c/experimental/filesystem:filesystem_interface"],
 )
diff --git a/tensorflow/c/experimental/filesystem/plugins/windows/windows_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/windows/windows_filesystem.cc
index 31400562a2579e..e382b829341411 100644
--- a/tensorflow/c/experimental/filesystem/plugins/windows/windows_filesystem.cc
+++ b/tensorflow/c/experimental/filesystem/plugins/windows/windows_filesystem.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include
 
 #include "tensorflow/c/experimental/filesystem/filesystem_interface.h"
-#include "tensorflow/c/tf_status.h"
 
 // Implementation of a filesystem for POSIX environments.
 // This filesystem will support `file://` and empty (local) URI schemes.
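An aside on the docker-pull change in ci/official/utilities/setup_docker.sh above: the retries now sleep 15s, 30s, then 60s instead of a fixed 15s between attempts. A minimal standalone sketch of that escalating-backoff pattern, assuming only that TFCI_DOCKER_IMAGE may be set in the environment (the fallback image name below is a hypothetical placeholder, not part of this patch):

    #!/bin/bash
    # Retry a flaky "docker pull" with escalating sleeps. Re-pulling an
    # already-pulled image finishes instantly, so repeating it is cheap.
    IMAGE="${TFCI_DOCKER_IMAGE:-ubuntu:22.04}"  # hypothetical default image
    for delay in 15 30 60; do
      docker pull "$IMAGE" && break  # stop retrying once a pull succeeds
      sleep "$delay"                 # back off longer after each failure
    done
    docker pull "$IMAGE"             # final attempt; failure propagates

The escalation bounds the total wait while still giving transient registry errors time to clear.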
diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.cc b/tensorflow/c/experimental/next_pluggable_device/c_api.cc
index 15a50a0a7c4060..a4d47753bdd5d8 100644
--- a/tensorflow/c/experimental/next_pluggable_device/c_api.cc
+++ b/tensorflow/c/experimental/next_pluggable_device/c_api.cc
@@ -91,7 +91,7 @@ void TF_LookupOrCreatePluginResource(
             void* opaque_plugin_resource = create_func(create_func_args);
             *new_resource = new tensorflow::PluginResource(
                 opaque_plugin_resource, plugin_resource_name, delete_func);
-            return tensorflow::OkStatus();
+            return absl::OkStatus();
           });
 
   if (cc_status.ok()) {
diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc
index 18a851e394aea7..dd15c9f078cd1d 100644
--- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc
+++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.cc
@@ -69,7 +69,7 @@ absl::Status SetPjRtCBufferToTensor(PJRT_Buffer* c_buffer,
 
 absl::StatusOr<xla::PjRtCApiClient*> GetPjRtCApiClient(
     const DeviceType& device_type) {
-  TF_ASSIGN_OR_RETURN(tsl::StatusOr<xla::PjRtClient*> pjrt_client,
+  TF_ASSIGN_OR_RETURN(absl::StatusOr<xla::PjRtClient*> pjrt_client,
                       tensorflow::GetPjRtClient(device_type));
   auto* pjrt_c_api_client = dynamic_cast<xla::PjRtCApiClient*>(*pjrt_client);
   if (pjrt_c_api_client == nullptr) {
diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h
index 85c60120b07241..c2b1051f75c39e 100644
--- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h
+++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util.h
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-StatusOr<PJRT_Buffer*> GetPjRtCBufferFromTensor(const Tensor* tensor);
+absl::StatusOr<PJRT_Buffer*> GetPjRtCBufferFromTensor(const Tensor* tensor);
 
 absl::Status SetPjRtCBufferToTensor(PJRT_Buffer* c_buffer,
                                     xla::PjRtCApiClient* c_api_client,
diff --git a/tensorflow/c/experimental/ops/gen/common/case_format.cc b/tensorflow/c/experimental/ops/gen/common/case_format.cc
index 9b8e955356db07..1e9d123005e8a4 100644
--- a/tensorflow/c/experimental/ops/gen/common/case_format.cc
+++ b/tensorflow/c/experimental/ops/gen/common/case_format.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/common/case_format.h"
 
+#include "tensorflow/core/platform/str_util.h"
+#include "tensorflow/core/platform/types.h"
+
 namespace tensorflow {
 namespace generator {
 
diff --git a/tensorflow/c/experimental/ops/gen/common/case_format_test.cc b/tensorflow/c/experimental/ops/gen/common/case_format_test.cc
index 37bc5be753fd64..302bcc42453169 100644
--- a/tensorflow/c/experimental/ops/gen/common/case_format_test.cc
+++ b/tensorflow/c/experimental/ops/gen/common/case_format_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/c/experimental/ops/gen/common/case_format.h"
 
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/common/controller.cc b/tensorflow/c/experimental/ops/gen/common/controller.cc
index a8e02f41011d32..cafb57c0919403 100644
--- a/tensorflow/c/experimental/ops/gen/common/controller.cc
+++ b/tensorflow/c/experimental/ops/gen/common/controller.cc
@@ -15,11 +15,17 @@ limitations under the License.
#include "tensorflow/c/experimental/ops/gen/common/controller.h" #include "absl/strings/substitute.h" +#include "tensorflow/c/experimental/ops/gen/common/path_config.h" +#include "tensorflow/c/experimental/ops/gen/common/source_code.h" +#include "tensorflow/c/experimental/ops/gen/model/op_spec.h" +#include "tensorflow/core/framework/api_def.pb.h" #include "tensorflow/core/framework/op.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_gen_lib.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" +#include "tsl/platform/status.h" namespace tensorflow { namespace generator { diff --git a/tensorflow/c/experimental/ops/gen/common/path_config.cc b/tensorflow/c/experimental/ops/gen/common/path_config.cc index d9e3881bf15580..b8f84d5f31f4d3 100644 --- a/tensorflow/c/experimental/ops/gen/common/path_config.cc +++ b/tensorflow/c/experimental/ops/gen/common/path_config.cc @@ -16,7 +16,9 @@ limitations under the License. #include +#include "absl/strings/str_join.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace generator { diff --git a/tensorflow/c/experimental/ops/gen/common/source_code.cc b/tensorflow/c/experimental/ops/gen/common/source_code.cc index ea4db53d167109..ea2b66fac7cd27 100644 --- a/tensorflow/c/experimental/ops/gen/common/source_code.cc +++ b/tensorflow/c/experimental/ops/gen/common/source_code.cc @@ -14,9 +14,12 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/common/source_code.h" +#include "absl/strings/ascii.h" #include "absl/strings/match.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stringpiece.h" namespace tensorflow { namespace generator { diff --git a/tensorflow/c/experimental/ops/gen/common/view_util.cc b/tensorflow/c/experimental/ops/gen/common/view_util.cc index a14c7e38b63b46..7c8717067b08fe 100644 --- a/tensorflow/c/experimental/ops/gen/common/view_util.cc +++ b/tensorflow/c/experimental/ops/gen/common/view_util.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/gen/common/view_util.h" +#include "absl/strings/str_join.h" #include "absl/strings/substitute.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace generator { diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc index 7ca7a7bf639bf7..509f209ffd7b42 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc @@ -44,7 +44,7 @@ Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, // Note that if shape is unknown rank, shape.dim_sizes() will be empty, and // shape.dims() will be -1. 
-  gtl::InlinedVector dim_sizes = shape.dim_sizes();
+  absl::InlinedVector dim_sizes = shape.dim_sizes();
   TF_RETURN_IF_ERROR(varhandle_op->SetAttrShape(
       "shape", reinterpret_cast<int64_t*>(dim_sizes.data()), shape.dims()));
diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc
index 0701e3b9aa9fff..bc0fae5fd9aeb9 100644
--- a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc
+++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc
@@ -506,12 +506,11 @@ TEST_P(CSavedModelAPITest, LoadSavedModelWithUninitializedVariable) {
       tensorflow::down_cast(
           tensorflow::unwrap(saved_model));
   tensorflow::Variable* uninitialized_variable;
-  ASSERT_EQ(::tensorflow::OkStatus(),
-            model_api->GetVariable("uninitialized_variable",
-                                   &uninitialized_variable));
+  ASSERT_EQ(absl::OkStatus(), model_api->GetVariable("uninitialized_variable",
+                                                     &uninitialized_variable));
   ASSERT_EQ(tensorflow::DT_FLOAT, uninitialized_variable->dtype());
 
-  ASSERT_EQ(::tensorflow::OkStatus(),
+  ASSERT_EQ(absl::OkStatus(),
             model_api->GetVariable("sub_module.uninitialized_variable",
                                    &uninitialized_variable));
   ASSERT_EQ(tensorflow::DT_INT64, uninitialized_variable->dtype());
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc
index 93d07b431ee4cf..65b31f8cfb8f1c 100644
--- a/tensorflow/c/experimental/stream_executor/stream_executor.cc
+++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc
@@ -200,14 +200,16 @@ void HostCallbackTrampoline(void* ctx, TF_Status* status) {
   delete host_ctx;
 }
 
-class CStreamExecutor : public StreamExecutorInterface {
+class CStreamExecutor : public StreamExecutor {
  public:
-  explicit CStreamExecutor(SP_Device device, SP_DeviceFns* device_fns,
+  explicit CStreamExecutor(Platform* se_platform, SP_Device device,
+                           SP_DeviceFns* device_fns,
                            SP_StreamExecutor* stream_executor,
                            SP_Platform* platform, SP_PlatformFns* platform_fns,
                            SP_TimerFns* timer_fns, const std::string& name,
                            int visible_device_count)
-      : device_(std::move(device)),
+      : StreamExecutor(se_platform),
+        device_(std::move(device)),
         device_fns_(device_fns),
         stream_executor_(stream_executor),
         platform_(platform),
@@ -405,10 +407,6 @@ class CStreamExecutor : public StreamExecutorInterface {
     return stream_executor_->host_callback(&device_, stream_handle,
                                            &HostCallbackTrampoline, ctx);
   }
-  absl::Status AllocateEvent(Event* event) override {
-    DCHECK(event != nullptr);
-    return static_cast<CEvent*>(event->implementation())->Create();
-  }
   absl::Status DeallocateEvent(Event* event) override {
     static_cast<CEvent*>(event->implementation())->Destroy();
     return absl::OkStatus();
   }
@@ -436,14 +434,6 @@ class CStreamExecutor : public StreamExecutorInterface {
         stream_executor_->get_event_status(&device_, event_handle);
     return SEEventStatusToEventStatus(event_status);
   }
-  bool AllocateStream(Stream* stream) override {
-    DCHECK(stream != nullptr);
-    absl::Status status =
-        static_cast<CStream*>(stream->implementation())->Create();
-    // TODO(annarev): update AllocateStream to return status instead
-    // (similar to AllocateEvent).
-    return status.ok();
-  }
   void DeallocateStream(Stream* stream) override {
     static_cast<CStream*>(stream->implementation())->Destroy();
   }
@@ -557,15 +547,19 @@ class CStreamExecutor : public StreamExecutorInterface {
     return builder.Build();
   }
 
-  // Each call creates a new instance of the platform-specific implementation of
-  // the corresponding interface type.
-  std::unique_ptr<internal::EventInterface> CreateEventImplementation() override {
-    return std::unique_ptr<internal::EventInterface>(
-        new CEvent(&device_, stream_executor_));
+  absl::StatusOr<std::unique_ptr<Event>> CreateEvent() override {
+    auto c_event = std::make_unique<CEvent>(&device_, stream_executor_);
+    TF_RETURN_IF_ERROR(c_event->Create());
+    return std::make_unique<Event>(this, std::move(c_event));
   }
-  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override {
-    return std::unique_ptr<internal::StreamInterface>(
-        new CStream(&device_, stream_executor_));
+
+  absl::StatusOr<std::unique_ptr<Stream>> CreateStream(
+      std::optional<std::variant<StreamPriority, int>> priority =
+          std::nullopt) override {
+    auto c_stream = std::make_unique<CStream>(&device_, stream_executor_);
+    TF_RETURN_IF_ERROR(c_stream->Create());
+    auto stream = std::make_unique<Stream>(this, std::move(c_stream));
+    return std::move(stream);
   }
 
  private:
@@ -644,11 +638,9 @@ absl::StatusOr<std::unique_ptr<StreamExecutor>> CPlatform::GetUncachedExecutor(
       c_status.get());
   TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get()));
 
-  auto executor = std::make_unique<CStreamExecutor>(
-      std::move(device), &device_fns_, &stream_executor_, &platform_,
+  return std::make_unique<CStreamExecutor>(
+      this, std::move(device), &device_fns_, &stream_executor_, &platform_,
       &platform_fns_, &timer_fns_, name_, visible_device_count);
-  auto result = std::make_unique<StreamExecutor>(this, std::move(executor));
-  return result;
 }
 
 absl::Status InitStreamExecutorPlugin(void* dso_handle,
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
index 56f25a5811293e..680a1d9d1db1f5 100644
--- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
+++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
@@ -342,11 +342,10 @@ TEST_F(StreamExecutorTest, CreateEvent) {
   StreamExecutor* executor = GetExecutor(0);
 
   ASSERT_FALSE(event_created);
-  Event* event = new Event(executor);
-  event->Init();
+  TF_ASSERT_OK_AND_ASSIGN(auto event, executor->CreateEvent());
   ASSERT_TRUE(event_created);
   ASSERT_FALSE(event_deleted);
-  delete event;
+  event.reset();
   ASSERT_TRUE(event_deleted);
 }
 
@@ -365,11 +364,10 @@ TEST_F(StreamExecutorTest, PollForEventStatus) {
   };
 
   StreamExecutor* executor = GetExecutor(0);
-  Event event(executor);
-  event.Init();
-  ASSERT_EQ(event.PollForStatus(), Event::Status::kComplete);
+  TF_ASSERT_OK_AND_ASSIGN(auto event, executor->CreateEvent());
+  ASSERT_EQ(event->PollForStatus(), Event::Status::kComplete);
 
   event_status = SE_EVENT_ERROR;
-  ASSERT_EQ(event.PollForStatus(), Event::Status::kError);
+  ASSERT_EQ(event->PollForStatus(), Event::Status::kError);
 }
 
 TEST_F(StreamExecutorTest, RecordAndWaitForEvent) {
@@ -403,14 +401,13 @@ TEST_F(StreamExecutorTest, RecordAndWaitForEvent) {
   };
 
   StreamExecutor* executor = GetExecutor(0);
-  Event event(executor);
-  event.Init();
+  TF_ASSERT_OK_AND_ASSIGN(auto event, executor->CreateEvent());
   TF_ASSERT_OK_AND_ASSIGN(auto stream, executor->CreateStream());
 
   ASSERT_FALSE(record_called);
-  TF_ASSERT_OK(stream->RecordEvent(&event));
+  TF_ASSERT_OK(stream->RecordEvent(event.get()));
   ASSERT_TRUE(record_called);
 
   ASSERT_FALSE(wait_called);
-  TF_ASSERT_OK(stream->WaitFor(&event));
+  TF_ASSERT_OK(stream->WaitFor(event.get()));
   ASSERT_TRUE(wait_called);
 }
 
diff --git a/tensorflow/c/kernels_experimental.cc b/tensorflow/c/kernels_experimental.cc
index 26173507f29aec..02e41428c6ae58 100644
--- a/tensorflow/c/kernels_experimental.cc
+++ b/tensorflow/c/kernels_experimental.cc
@@ -266,7 +266,7 @@ void TF_AssignUpdateVariable(TF_OpKernelContext* ctx, int input_index,
   Status status = LookupResource(context, HandleFromInput(context, input_index),
                                  &variable);
   if (!status.ok()) {
-    printf("Failed with error: %s\n", tsl::NullTerminatedMessage(status));
+    printf("Failed with error: %s\n", absl::StatusMessageAsCStr(status));
     abort();
   }
   const Tensor& value = context->input(value_index);
diff --git a/tensorflow/c/tf_status_helper.cc b/tensorflow/c/tf_status_helper.cc
index bbeae6f76bc497..c96a5af7440dff 100644
--- a/tensorflow/c/tf_status_helper.cc
+++ b/tensorflow/c/tf_status_helper.cc
@@ -25,7 +25,7 @@ namespace tsl {
 void Set_TF_Status_from_Status(TF_Status* tf_status,
                                const absl::Status& status) {
   TF_SetStatus(tf_status, TSLCodeFromStatusCode(status.code()),
-               tsl::NullTerminatedMessage(status));
+               absl::StatusMessageAsCStr(status));
 
   status.ForEachPayload(
       [tf_status](absl::string_view key, const absl::Cord& value) {
         std::string key_str(key);
diff --git a/tensorflow/cc/experimental/libtf/impl/BUILD b/tensorflow/cc/experimental/libtf/impl/BUILD
index 4f5b7ccfd84940..97b06b21682daa 100644
--- a/tensorflow/cc/experimental/libtf/impl/BUILD
+++ b/tensorflow/cc/experimental/libtf/impl/BUILD
@@ -39,6 +39,8 @@ tf_cc_test(
         ":scalars",
         ":string",
        ":tensor_spec",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
    ],
)
@@ -123,6 +125,8 @@ tf_cc_test(
    deps = [
        ":iostream",  # Necessary for absl::VerifyTypeImplementsAbslHashCorrectly.
        ":tensor_spec",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
        "@com_google_absl//absl/hash:hash_testing",
diff --git a/tensorflow/cc/experimental/libtf/impl/iostream_test.cc b/tensorflow/cc/experimental/libtf/impl/iostream_test.cc
index 40c3d7550d00d4..dede1483d76187 100644
--- a/tensorflow/cc/experimental/libtf/impl/iostream_test.cc
+++ b/tensorflow/cc/experimental/libtf/impl/iostream_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/cc/experimental/libtf/impl/scalars.h"
 #include "tensorflow/cc/experimental/libtf/impl/string.h"
 #include "tensorflow/cc/experimental/libtf/impl/tensor_spec.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tf {
diff --git a/tensorflow/cc/experimental/libtf/impl/tensor_spec_test.cc b/tensorflow/cc/experimental/libtf/impl/tensor_spec_test.cc
index e0654bec85fb29..dc07f77c7ba9b7 100644
--- a/tensorflow/cc/experimental/libtf/impl/tensor_spec_test.cc
+++ b/tensorflow/cc/experimental/libtf/impl/tensor_spec_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
#include "tensorflow/cc/experimental/libtf/impl/tensor_spec.h" #include "absl/hash/hash_testing.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/test.h" namespace tf { diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 6cc731e722d16b..da27e61d380081 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -544,9 +544,7 @@ cc_library( name = "fingerprinting_utils", srcs = ["fingerprinting_utils.cc"], hdrs = ["fingerprinting_utils.h"], - visibility = [ - "//tensorflow:__pkg__", - ], + visibility = ["//visibility:private"], deps = [ ":constants", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 93d7527bdd409f..4666ddd5db9ed6 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -668,7 +668,6 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { set_static_data_program_shape(data, StaticProgramShape()); set_static_data_hlo_profile_printer_data( data, StaticHloProfilePrinterData()); - set_static_data_use_xla_runtime(data, {{USE_XLA_RUNTIME}}); {{ASSIGN_PROFILE_COUNTERS_SIZE}} return data; }(); @@ -822,7 +821,6 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { {"{{DECLS_FROM_OBJ_FILE}}", absl::StrJoin(metadata_result.header_variable_decls, "\n")}, {"{{ENTRY}}", compile_result.entry_point}, - {"{{USE_XLA_RUNTIME}}", opts.use_xla_runtime ? "true" : "false"}, {"{{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}}", metadata_result.hlo_profile_printer_data_access_shim}, {"{{INCLUDE_XLA_DATA_PROTO}}", include_xla_data_proto}, diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 71b234a8385806..cd1a72308c3ede 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -97,7 +97,6 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { set_static_data_program_shape(data, StaticProgramShape()); set_static_data_hlo_profile_printer_data( data, StaticHloProfilePrinterData()); - set_static_data_use_xla_runtime(data, false); return data; }(); diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index a543aae5b92997..99c8541c55488c 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -319,8 +319,6 @@ def _tf_library( ] or []) + (include_standard_runtime_deps and [ # TODO(cwhipkey): only depend on kernel code that the model actually # needed. 
- "@local_xla//xla/service/cpu/runtime:convolution_ffi", - "@local_xla//xla/service/cpu/runtime:rng_ffi", "@local_xla//xla/service/cpu:runtime_conv2d", "@local_xla//xla/service/cpu:runtime_custom_call_status", "@local_xla//xla/service/cpu:runtime_key_value_sort", diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 76f3c147903748..623334534567de 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -199,6 +199,7 @@ cc_library( "//tensorflow/core/tpu:tpu_node_device_util", "//tensorflow/core/tpu:virtual_device", "@com_google_absl//absl/types:optional", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla/stream_executor/tpu:c_api_conversions", "@local_xla//xla/stream_executor/tpu:status_helper", "@local_xla//xla/stream_executor/tpu:tpu_api", @@ -314,6 +315,7 @@ cc_library( "//tensorflow/core/common_runtime:dma_helper", "//tensorflow/core/framework:allocator", "@com_google_absl//absl/synchronization", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla:util", "@local_xla//xla/client:global_data", "@local_xla//xla/client:local_client", @@ -1149,6 +1151,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/numeric:bits", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", "@local_xla//xla:status_macros", diff --git a/tensorflow/compiler/jit/device_util.h b/tensorflow/compiler/jit/device_util.h index df3b7d04fbfe7b..ec4d9484ae8854 100644 --- a/tensorflow/compiler/jit/device_util.h +++ b/tensorflow/compiler/jit/device_util.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" +#include "absl/numeric/bits.h" #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -79,7 +80,7 @@ class DeviceSet { uint64 only_lowest_bit_set = word & -word; // The number of trailing zeros in a non-zero word is the index of the // least significant 1. - int bit_index = ctz_uint64(word); + int bit_index = absl::countr_zero(word); if (!func(DeviceId(word_index * kWordSize + bit_index))) { return; } @@ -89,20 +90,6 @@ class DeviceSet { } private: - static int ctz_uint64(uint64 x) { - DCHECK_NE(x, 0); -#ifdef __GNUC__ - return __builtin_ctzl(x); -#else - int result = 0u; - while ((x & 1u) == 0u) { - x >>= 1; - ++result; - } - return result; -#endif - } - absl::InlinedVector storage_; const int kWordSize = 64; diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 0ac326c61fb3ec..d173564b7fd10d 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -59,8 +59,10 @@ cc_library( "//tensorflow/compiler/jit:xla_compile_util", "//tensorflow/core/platform:refcount", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@local_xla//xla/pjrt:pjrt_client", + "@local_xla//xla/tsl/concurrency:async_value", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 9d75388cfbbe80..5a29e8ef36e9b3 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -24,10 +24,10 @@ limitations under the License. 
 #include
 #include
 #include
-#include
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/device_compilation_profiler.h"
@@ -52,7 +52,7 @@ limitations under the License.
 #include "xla/executable_run_options.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
-#include "xla/statusor.h"
+#include "xla/tsl/concurrency/async_value_ref.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -224,7 +224,7 @@ xla::SendDeviceMemoryFunction GetSendDeviceMemoryFunction(
           int64_t channel_id, se::Stream* stream, const xla::Shape& shape,
           const se::DeviceMemoryBase& device_memory_base,
           const absl::flat_hash_map<std::string, std::string>& frontend_attrs)
-      -> absl::StatusOr<tsl::AsyncValueRef<se::Event>> {
+      -> absl::StatusOr<tsl::AsyncValueRef<std::unique_ptr<se::Event>>> {
     auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous");
 
     // Generate the Rendezvous key.
@@ -244,12 +244,10 @@ xla::SendDeviceMemoryFunction GetSendDeviceMemoryFunction(
     RendezvousInterface::ParsedKey parsed_key;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(rendezvous_key, &parsed_key));
 
-    tsl::AsyncValueRef<se::Event> done_event =
-        tsl::MakeConstructedAsyncValueRef<se::Event>(stream->parent());
-    if (!done_event->Init()) {
-      return errors::Internal(
-          "Failed to initialize done event (channel_id=%d)", channel_id);
-    }
+    TF_ASSIGN_OR_RETURN(auto event, stream->parent()->CreateEvent());
+    tsl::AsyncValueRef<std::unique_ptr<se::Event>> done_event =
+        tsl::MakeConstructedAsyncValueRef<std::unique_ptr<se::Event>>(
+            std::move(event));
 
     Rendezvous::Args args;
     // Rendezvous::Args owns the device context pointer.
@@ -273,7 +271,7 @@ xla::RecvDeviceMemoryFunction GetRecvDeviceMemoryFunction(
           int64_t channel_id, se::Stream* stream, const xla::Shape& shape,
           se::DeviceMemoryBase* device_memory_base,
           const absl::flat_hash_map<std::string, std::string>& frontend_attrs)
-      -> absl::StatusOr<tsl::AsyncValueRef<se::Event>> {
+      -> absl::StatusOr<tsl::AsyncValueRef<std::unique_ptr<se::Event>>> {
     auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous");
 
     // Generate the Rendezvous key.
@@ -293,12 +291,10 @@ xla::RecvDeviceMemoryFunction GetRecvDeviceMemoryFunction(
     RendezvousInterface::ParsedKey parsed_key;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(rendezvous_key, &parsed_key));
 
-    tsl::AsyncValueRef<se::Event> done_event =
-        tsl::MakeConstructedAsyncValueRef<se::Event>(stream->parent());
-    if (!done_event->Init()) {
-      return errors::Internal(
-          "Failed to initialize done event (channel_id=%d)", channel_id);
-    }
+    TF_ASSIGN_OR_RETURN(auto event, stream->parent()->CreateEvent());
+    tsl::AsyncValueRef<std::unique_ptr<se::Event>> done_event =
+        tsl::MakeConstructedAsyncValueRef<std::unique_ptr<se::Event>>(
+            std::move(event));
 
     Rendezvous::Args args;
     // Rendezvous::Args owns the device context pointer.
diff --git a/tensorflow/compiler/jit/node_matchers_test.cc b/tensorflow/compiler/jit/node_matchers_test.cc
index 8edb3e456c4c00..6f37d5617b6ce6 100644
--- a/tensorflow/compiler/jit/node_matchers_test.cc
+++ b/tensorflow/compiler/jit/node_matchers_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
#include "tensorflow/compiler/jit/node_matchers.h" +#include + #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/const_op.h" @@ -117,12 +119,26 @@ TEST(NodeMatchers, CheckControlDependence) { EXPECT_THAT(placeholder_d.node(), NodeWith(Name("placeholder_d"), CtrlDeps())); - EXPECT_EQ( - Explain(placeholder_c.node(), NodeWith(CtrlDeps())), - "ctrl_deps, which has 2 elements, does not match expected: is empty"); - EXPECT_EQ(Explain(placeholder_d.node(), NodeWith(CtrlDeps(NodeWith()))), - "ctrl_deps does not match expected: has 1 element and that element " - "is any node"); + // TODO(griffithjames): Exactly match these explanations. + // + // When the OSS build has been updated to include the new error messages, the + // Explain() expectations can be exact strings again. + { + const std::string explanation = + Explain(placeholder_c.node(), NodeWith(CtrlDeps())); + EXPECT_NE(explanation.find("ctrl_deps, which has 2 elements"), + std::string::npos); + EXPECT_NE(explanation.find("does not match expected: is empty"), + std::string::npos); + } + { + const std::string explanation = + Explain(placeholder_d.node(), NodeWith(CtrlDeps(NodeWith()))); + EXPECT_NE(explanation.find("ctrl_deps"), std::string::npos); + EXPECT_NE(explanation.find("does not match expected: has 1 element and " + "that element is any node"), + std::string::npos); + } } TEST(NodeMatchers, ConstValue) { diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index b5b0c16422ccab..471f54571d2b53 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -52,7 +52,6 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 821d294af90f66..faf3b65d407a7e 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/tensor_reference.h" +#include "tsl/platform/statusor.h" namespace tensorflow { @@ -171,8 +172,8 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, host_to_device_stream_.get(), literal, xla_tensor->shaped_buffer())); if (UseMultipleStreams()) { - auto event = std::make_shared(stream_->parent()); - TF_RET_CHECK(event->Init()) << "Event failed to initialize!"; + TF_ASSIGN_OR_RETURN(std::shared_ptr event, + stream_->parent()->CreateEvent()); TF_RETURN_IF_ERROR(host_to_device_stream_->RecordEvent(event.get())); xla_tensor->ResetDefinitionEvent(std::move(event), host_to_device_stream_.get()); diff --git a/tensorflow/compiler/jit/xla_host_recv_device_context.cc b/tensorflow/compiler/jit/xla_host_recv_device_context.cc index 54f22fe59fa0bf..ae3c149d5d1387 100644 --- a/tensorflow/compiler/jit/xla_host_recv_device_context.cc +++ b/tensorflow/compiler/jit/xla_host_recv_device_context.cc @@ -38,7 +38,7 @@ void XlaHostRecvDeviceContext::CopyDeviceTensorToCPU( done(status); return; } - status = stream_->RecordEvent(&done_event_.get()); + status = stream_->RecordEvent(done_event_.get().get()); if (!status.ok()) { done(status); return; diff --git a/tensorflow/compiler/jit/xla_host_recv_device_context.h b/tensorflow/compiler/jit/xla_host_recv_device_context.h index 8938fd9c9e0c17..028fd4efd68091 100644 --- a/tensorflow/compiler/jit/xla_host_recv_device_context.h +++ b/tensorflow/compiler/jit/xla_host_recv_device_context.h @@ -36,8 +36,8 @@ namespace tensorflow { // Tensor device_tensor(device_allocator, DT_FLOAT, TensorShape({2, 2})); // se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; // xla::Shape shape(xla::F32, {2, 2}, {}, {}) -// tsl::AsyncValueRef done_event = -// tsl::MakeConstructedAsyncValueRef(stream.parent()); +// tsl::AsyncValueRef> done_event = +// tsl::MakeConstructedAsyncValueRef>(stream.parent()); // done_event->Init(); // Tensor dest_cpu_tensor; // @@ -48,10 +48,10 @@ namespace tensorflow { class XlaHostRecvDeviceContext : public DeviceContext { public: - XlaHostRecvDeviceContext(se::Stream* stream, - const se::DeviceMemoryBase& device_memory_base, - const xla::Shape& shape, - tsl::AsyncValueRef& done_event) + XlaHostRecvDeviceContext( + se::Stream* stream, const se::DeviceMemoryBase& device_memory_base, + const xla::Shape& shape, + tsl::AsyncValueRef>& done_event) : stream_(stream), device_memory_base_(device_memory_base), shape_(shape), @@ -82,7 +82,7 @@ class XlaHostRecvDeviceContext : public DeviceContext { // not an issue here since only DeviceMemoryBase methods/members are used. 
   const se::DeviceMemoryBase device_memory_base_;
   const xla::Shape shape_;
-  tsl::AsyncValueRef<se::Event> done_event_;
+  tsl::AsyncValueRef<std::unique_ptr<se::Event>> done_event_;
 
   XlaHostRecvDeviceContext(const XlaHostRecvDeviceContext&) = delete;
   void operator=(const XlaHostRecvDeviceContext&) = delete;
diff --git a/tensorflow/compiler/jit/xla_host_send_device_context.cc b/tensorflow/compiler/jit/xla_host_send_device_context.cc
index 5d106c8dc3e073..3d1a9a9f5228c6 100644
--- a/tensorflow/compiler/jit/xla_host_send_device_context.cc
+++ b/tensorflow/compiler/jit/xla_host_send_device_context.cc
@@ -28,7 +28,7 @@ void XlaHostSendDeviceContext::CopyCPUTensorToDevice(
     done(status);
     return;
   }
-  status = stream_->RecordEvent(&done_event_.get());
+  status = stream_->RecordEvent(done_event_.get().get());
   if (!status.ok()) {
     done(status);
     return;
diff --git a/tensorflow/compiler/jit/xla_host_send_device_context.h b/tensorflow/compiler/jit/xla_host_send_device_context.h
index d7a254770c969e..f4e4e9a2535341 100644
--- a/tensorflow/compiler/jit/xla_host_send_device_context.h
+++ b/tensorflow/compiler/jit/xla_host_send_device_context.h
@@ -37,8 +37,8 @@ namespace tensorflow {
 //  Tensor device_tensor(device_allocator, DT_FLOAT, TensorShape({2, 2}));
 //  se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)};
 //  xla::Shape shape(xla::F32, {2, 2}, {}, {})
-//  tsl::AsyncValueRef<se::Event> done_event =
-//      tsl::MakeConstructedAsyncValueRef<se::Event>(stream.parent());
+//  tsl::AsyncValueRef<std::unique_ptr<se::Event>> done_event =
+//      tsl::MakeConstructedAsyncValueRef<std::unique_ptr<se::Event>>(stream.parent());
 //  done_event->Init();
 //
 //  XlaHostSendDeviceContext device_context(&stream, &gpu_dst,
@@ -48,10 +48,10 @@ namespace tensorflow {
 class XlaHostSendDeviceContext : public DeviceContext {
  public:
-  XlaHostSendDeviceContext(se::Stream* stream,
-                           se::DeviceMemoryBase* device_memory_base,
-                           const xla::Shape& shape,
-                           tsl::AsyncValueRef<se::Event>& done_event)
+  XlaHostSendDeviceContext(
+      se::Stream* stream, se::DeviceMemoryBase* device_memory_base,
+      const xla::Shape& shape,
+      tsl::AsyncValueRef<std::unique_ptr<se::Event>>& done_event)
       : stream_(stream),
         device_memory_base_(device_memory_base),
         shape_(shape),
@@ -79,7 +79,7 @@ class XlaHostSendDeviceContext : public DeviceContext {
   se::Stream* stream_;                        // Not owned.
   se::DeviceMemoryBase* device_memory_base_;  // Not owned.
   const xla::Shape shape_;
-  tsl::AsyncValueRef<se::Event> done_event_;
+  tsl::AsyncValueRef<std::unique_ptr<se::Event>> done_event_;
 
   XlaHostSendDeviceContext(const XlaHostSendDeviceContext&) = delete;
   void operator=(const XlaHostSendDeviceContext&) = delete;
diff --git a/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc b/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc
index 16f42d1dbe1a0d..62da04c3e7510f 100644
--- a/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc
+++ b/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc
@@ -79,9 +79,10 @@ TEST_F(XlaHostSendRecvDeviceContextTest, CopyDeviceTensorToCPU) {
       stream->Memcpy(&gpu_dst, origin_cpu_tensor.data(), gpu_dst.size()));
   TF_ASSERT_OK(stream->BlockHostUntilDone());
 
-  tsl::AsyncValueRef<se::Event> done_event =
-      tsl::MakeConstructedAsyncValueRef<se::Event>(executor);
-  done_event->Init();
+  TF_ASSERT_OK_AND_ASSIGN(auto se_event, executor->CreateEvent());
+  tsl::AsyncValueRef<std::unique_ptr<se::Event>> done_event =
+      tsl::MakeConstructedAsyncValueRef<std::unique_ptr<se::Event>>(
+          std::move(se_event));
   XlaHostRecvDeviceContext* device_context =
       new XlaHostRecvDeviceContext(stream.get(), gpu_dst, shape, done_event);
   TF_ASSERT_OK(device_context->CopyDeviceTensorToCPUSync(
@@ -108,9 +109,10 @@ TEST_F(XlaHostSendRecvDeviceContextTest, CopyCPUTensorToDevice) {
   xla::Shape shape;
   TF_ASSERT_OK(TensorShapeToXLAShape(DT_FLOAT, TensorShape({2, 2}), &shape));
 
-  tsl::AsyncValueRef<se::Event> done_event =
-      tsl::MakeConstructedAsyncValueRef<se::Event>(executor);
-  done_event->Init();
+  TF_ASSERT_OK_AND_ASSIGN(auto se_event, executor->CreateEvent());
+  tsl::AsyncValueRef<std::unique_ptr<se::Event>> done_event =
+      tsl::MakeConstructedAsyncValueRef<std::unique_ptr<se::Event>>(
+          std::move(se_event));
   XlaHostSendDeviceContext* device_context =
       new XlaHostSendDeviceContext(stream.get(), &gpu_dst, shape, done_event);
   TF_ASSERT_OK(device_context->CopyCPUTensorToDeviceSync(
@@ -141,17 +143,19 @@ TEST_F(XlaHostSendRecvDeviceContextTest, RoundTrip) {
   xla::Shape shape;
   TF_ASSERT_OK(TensorShapeToXLAShape(DT_FLOAT, TensorShape({2, 2}), &shape));
 
-  tsl::AsyncValueRef<se::Event> send_done_event =
-      tsl::MakeConstructedAsyncValueRef<se::Event>(executor);
-  send_done_event->Init();
+  TF_ASSERT_OK_AND_ASSIGN(auto se_event, executor->CreateEvent());
+  tsl::AsyncValueRef<std::unique_ptr<se::Event>> send_done_event =
+      tsl::MakeConstructedAsyncValueRef<std::unique_ptr<se::Event>>(
+          std::move(se_event));
   XlaHostSendDeviceContext* send_device_context = new XlaHostSendDeviceContext(
       stream.get(), &gpu_dst, shape, send_done_event);
   TF_ASSERT_OK(send_device_context->CopyCPUTensorToDeviceSync(
       &origin_cpu_tensor, device_.get(), &device_tensor));
 
-  tsl::AsyncValueRef<se::Event> recv_done_event =
-      tsl::MakeConstructedAsyncValueRef<se::Event>(executor);
-  recv_done_event->Init();
+  TF_ASSERT_OK_AND_ASSIGN(auto recv_se_event, executor->CreateEvent());
+  tsl::AsyncValueRef<std::unique_ptr<se::Event>> recv_done_event =
+      tsl::MakeConstructedAsyncValueRef<std::unique_ptr<se::Event>>(
+          std::move(recv_se_event));
   XlaHostRecvDeviceContext* recv_device_context = new XlaHostRecvDeviceContext(
       stream.get(), gpu_dst, shape, recv_done_event);
   TF_ASSERT_OK(recv_device_context->CopyDeviceTensorToCPUSync(
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 9107e07b83bc21..cfeaa937024b32 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -390,10 +390,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
 
   std::shared_ptr<se::Event> definition_event;
   if (use_multiple_streams_ && stream) {
-    definition_event = std::make_shared<se::Event>(stream->parent());
-    if (!definition_event->Init()) {
-      return errors::Internal("Failed to initialize tensor definition event.");
-    }
+    TF_ASSIGN_OR_RETURN(definition_event, stream->parent()->CreateEvent());
     TF_RETURN_IF_ERROR(stream->RecordEvent(definition_event.get()));
   }
 
@@ -410,7 +407,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
     if (output.on_host_shape().is_dynamic()) {
       const se::Platform* platform = nullptr;
       if (stream != nullptr) {
-        platform = stream->parent()->platform();
+        platform = stream->parent()->GetPlatform();
       } else {
         // Stream is not set for the host platform.
         TF_ASSIGN_OR_RETURN(platform,
@@ -670,7 +667,8 @@ Status PreparePjRtExecutableArguments(
       std::unique_ptr<xla::PjRtStreamExecutorBuffer> pjrt_buffer =
           std::make_unique<xla::PjRtStreamExecutorBuffer>(
              device_shape, std::move(device_buffer), pjrt_client,
-              pjrt_device);
+              pjrt_device,
+              pjrt_device->default_memory_space().value_or(nullptr));
      owned_args->push_back(std::move(pjrt_buffer));
      args->push_back(owned_args->back().get());
    }
@@ -866,7 +864,7 @@ Status RunPjRtExecutable(
                      pjrt_client->LookupAddressableDevice(pjrt_device_id));
 
   gpu::GpuServingDeviceSelectorResource* device_selector_resource = nullptr;
-  if (device_type == DEVICE_GPU) {
+  if (device_type == DEVICE_GPU && gpu::kUseGpuServingDeviceSelector) {
     auto rm = ctx->resource_manager();
     TF_RETURN_IF_ERROR(rm->LookupOrCreate<
                        gpu::GpuServingDeviceSelectorResource>(
diff --git a/tensorflow/compiler/jit/xla_platform_info.cc b/tensorflow/compiler/jit/xla_platform_info.cc
index 1486340e95da3b..a6b066f7460168 100644
--- a/tensorflow/compiler/jit/xla_platform_info.cc
+++ b/tensorflow/compiler/jit/xla_platform_info.cc
@@ -377,7 +377,7 @@ XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device_base) {
     auto device = static_cast(device_base);
     platform_id = device->tensorflow_accelerator_device_info()
                       ->stream->parent()
-                      ->platform()
+                      ->GetPlatform()
                       ->id();
   } else if (XlaDevice::GetMetadataFromDevice(device_base, &xla_device_metadata)
                  .ok()) {
diff --git a/tensorflow/compiler/jit/xla_tpu_device.cc b/tensorflow/compiler/jit/xla_tpu_device.cc
index dfedd586df69aa..403e6b17e6fc00 100644
--- a/tensorflow/compiler/jit/xla_tpu_device.cc
+++ b/tensorflow/compiler/jit/xla_tpu_device.cc
@@ -43,6 +43,7 @@ limitations under the License.
diff --git a/tensorflow/compiler/jit/xla_tpu_device.cc b/tensorflow/compiler/jit/xla_tpu_device.cc
index dfedd586df69aa..403e6b17e6fc00 100644
--- a/tensorflow/compiler/jit/xla_tpu_device.cc
+++ b/tensorflow/compiler/jit/xla_tpu_device.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "tensorflow/core/tpu/tpu_defs.h"
 #include "tensorflow/core/tpu/tpu_node_device_util.h"
 #include "tensorflow/core/tpu/virtual_device.h"
+#include "tsl/platform/statusor.h"
 
 namespace tensorflow {
 namespace {
@@ -271,9 +272,8 @@ void TpuDeviceToDeviceCopy(DeviceContext* src_dev_context,
         dst_xla_context->host_to_device_stream()));
   }
 
-  auto definition_event =
-      std::make_shared<se::Event>(dst_xla_context->stream()->parent());
-  TF_RET_CHECK(definition_event->Init()) << "Event failed to initialize!";
+  TF_ASSIGN_OR_RETURN(std::shared_ptr<se::Event> definition_event,
+                      dst_xla_context->stream()->parent()->CreateEvent());
   TF_RETURN_IF_ERROR(
       dst_device_to_device_stream->RecordEvent(definition_event.get()));
   xla_output->ResetDefinitionEvent(std::move(definition_event),
diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD
index d0286e5acff9ce..46d5e7e9fb9005 100644
--- a/tensorflow/compiler/mlir/BUILD
+++ b/tensorflow/compiler/mlir/BUILD
@@ -37,6 +37,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -51,7 +52,6 @@ cc_library(
         "//tensorflow/compiler/mlir/lite:tensorflow_lite",
         "//tensorflow/compiler/mlir/lite:tf_tfl_passes",  # buildcleaner:keep
         "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes",
-        "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_pass_registration",
         "//tensorflow/compiler/mlir/tensorflow:mlprogram_util",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_test_passes",
@@ -75,7 +75,6 @@ cc_library(
         "@local_xla//xla/mlir/framework/ir:xla_framework",
         "@local_xla//xla/mlir/framework/transforms:passes",
         "@local_xla//xla/mlir_hlo:all_passes",
-        "@local_xla//xla/service/cpu:hlo_xla_runtime_pipeline",
     ],
 )
 
@@ -190,7 +189,6 @@ cc_library(
     deps = [
         "//tensorflow/compiler/mlir/lite:tensorflow_lite",
         "//tensorflow/compiler/mlir/tensorflow",
-        "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_pass_registration",
         "//tensorflow/compiler/mlir/tensorflow:mlprogram_util",
         "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops",
         "@llvm-project//mlir:AllExtensions",
@@ -204,7 +202,6 @@ cc_library(
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir/framework/ir:xla_framework",
         "@local_xla//xla/mlir_hlo:hlo_dialect_registration",
-        "@local_xla//xla/service/cpu:hlo_xla_runtime_pipeline",
         "@stablehlo//:register",
     ],
 )
@@ -229,7 +226,6 @@ tf_cc_binary(
         "//tensorflow/compiler/mlir/tensorflow:translate_cl_options",
         "//tensorflow/compiler/mlir/tensorflow:translate_lib",
         "//tensorflow/compiler/mlir/tensorflow:translate_registration",
-        "//tensorflow/compiler/mlir/tensorflow:translate_tf_dialect_op",
         "//tensorflow/core:lib",
         "//tensorflow/core:tensorflow",
         "@com_google_absl//absl/strings",
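`TF_ASSIGN_OR_RETURN`, used in the xla_launch_util.cc and xla_tpu_device.cc hunks above (and its test-side cousin `TF_ASSERT_OK_AND_ASSIGN`), unwraps a `StatusOr` or returns early on error. Conceptually it expands to the fragment below; this is a simplified hand expansion, not the real macro, which lives in `tsl/platform/statusor.h` and generates a hygienic temporary name. The fragment assumes the surrounding Status-returning function from the hunk:

```cpp
// Hand expansion of:
//   TF_ASSIGN_OR_RETURN(std::shared_ptr<se::Event> definition_event,
//                       dst_xla_context->stream()->parent()->CreateEvent());
auto statusor = dst_xla_context->stream()->parent()->CreateEvent();
if (!statusor.ok()) {
  return statusor.status();  // early-return the error to the caller
}
// The StatusOr holds a std::unique_ptr<se::Event>, which converts
// implicitly to the declared std::shared_ptr<se::Event>.
std::shared_ptr<se::Event> definition_event = std::move(statusor).value();
```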
diff --git a/tensorflow/compiler/mlir/init_mlir.cc b/tensorflow/compiler/mlir/init_mlir.cc
index 938cd52359b9d6..ce7cefabcdcf73 100644
--- a/tensorflow/compiler/mlir/init_mlir.cc
+++ b/tensorflow/compiler/mlir/init_mlir.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/init_mlir.h"
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "tensorflow/core/platform/init_main.h"
diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD
index c3826f1bfb935c..7e49b1d028ce69 100644
--- a/tensorflow/compiler/mlir/lite/BUILD
+++ b/tensorflow/compiler/mlir/lite/BUILD
@@ -19,6 +19,7 @@ package_group(
         "//third_party/odml/infra/...",
         "//tensorflow/compiler/mlir/...",
         "//tensorflow/lite/python/...",
+        "//waymo/accelerator/alpine/tools/...",
         "//waymo/ml/compiler/mlir/...",
         # Allow visibility from the mlir language server.
         "//learning/brain/mlir/mlir_lsp_server/...",
@@ -310,6 +311,7 @@ cc_library(
         "@llvm-project//mlir:Dialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -360,6 +362,7 @@ cc_library(
         ":tensorflow_lite_ops_inc_gen",
         ":tensorflow_lite_passes_inc_gen",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
+        "//tensorflow/compiler/mlir/lite/schema:schema_fbs",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
         "//tensorflow/compiler/mlir/tensorflow",
@@ -369,7 +372,6 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_traits",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
         "//tensorflow/core:framework",
-        "//tensorflow/lite/schema:schema_fbs",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
@@ -702,6 +704,8 @@ cc_library(
         ":variables_utils",
         "//tensorflow/compiler/mlir:op_or_arg_name_mapper",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
+        "//tensorflow/compiler/mlir/lite/schema:schema_fbs",
+        "//tensorflow/compiler/mlir/lite/stablehlo:optimize_layout",
         "//tensorflow/compiler/mlir/lite/stablehlo:tf_legalize_hlo",
         "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo",
         "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
@@ -725,7 +729,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:tensor_list",
-        "//tensorflow/lite/schema:schema_fbs",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
@@ -860,6 +863,7 @@ cc_library(
     deps = [
         "convert_type",
         ":op_quant_spec_getters_inc",
+        ":stateful_ops_utils",
         ":tensorflow_lite",
         ":tensorflow_lite_passes_inc_gen",
         ":tensorflow_lite_post_quantize_inc_gen",
@@ -867,6 +871,7 @@ cc_library(
         ":validators",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/lite/quantization/lite:tfl_to_std",
+        "//tensorflow/compiler/mlir/lite/schema:schema_fbs",
         "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types",
         "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
@@ -875,7 +880,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/platform:logging",
-        "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/tools/optimize:operator_property",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -910,6 +914,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -1020,6
+1025,8 @@ cc_library( ":convert_type", ":converter_inc", ":tensorflow_lite", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs_with_mutable", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core/platform:errors", @@ -1027,8 +1034,6 @@ cc_library( "//tensorflow/core/platform:statusor", "//tensorflow/lite/core/c:private_common", "//tensorflow/lite/kernels/internal:kernel_utils", - "//tensorflow/lite/schema:schema_fbs", - "//tensorflow/lite/schema:schema_fbs_with_mutable", "//tensorflow/lite/schema:schema_utils", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", @@ -1036,6 +1041,7 @@ cc_library( "@llvm-project//llvm:Analysis", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@local_tsl//tsl/platform:status", "@local_xla//xla:statusor", @@ -1049,8 +1055,8 @@ tf_native_cc_binary( name = "flatbuffer_to_string", srcs = ["flatbuffer_to_string.cc"], deps = [ + "//tensorflow/compiler/mlir/lite/schema:schema_fbs_with_reflection", "//tensorflow/lite/core:model_builder", - "//tensorflow/lite/schema:schema_fbs_with_reflection", "@flatbuffers", ], ) @@ -1059,26 +1065,11 @@ tf_native_cc_binary( name = "json_to_flatbuffer", srcs = ["json_to_flatbuffer.cc"], deps = [ - "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "@flatbuffers", ], ) -cc_library( - name = "emit_error_reporter", - srcs = [ - "emit_error_reporter.cc", - ], - hdrs = [ - "emit_error_reporter.h", - ], - deps = [ - "//tensorflow/lite/core/api", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - ], -) - cc_library( name = "flatbuffer_export", srcs = [ @@ -1097,6 +1088,7 @@ cc_library( "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite/metrics:error_collector_inst", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs_with_mutable", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", @@ -1114,7 +1106,6 @@ cc_library( "//tensorflow/lite/experimental/remat:metadata_util", "//tensorflow/lite/python/metrics:converter_error_data_proto_cc", "//tensorflow/lite/schema:schema_conversion_utils", - "//tensorflow/lite/schema:schema_fbs_with_mutable", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/tools/versioning", "//tensorflow/lite/tools/versioning:gpu_compatibility", @@ -1124,6 +1115,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:cord", "@com_google_absl//absl/strings:str_format", "@flatbuffers", "@llvm-project//llvm:Support", @@ -1156,6 +1148,7 @@ cc_library( ":size_utils", ":tensorflow_lite", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs_with_mutable", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_composite_to_tfl_custom", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_to_vhlo_pass", "//tensorflow/compiler/mlir/quantization/common/quantization_lib", @@ -1170,7 +1163,6 @@ cc_library( "//tensorflow/core/platform:status", "//tensorflow/lite:framework", "//tensorflow/lite/experimental/remat:metadata_util", - 
"//tensorflow/lite/schema:schema_fbs_with_mutable", "//tensorflow/lite/schema:schema_utils", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1203,11 +1195,12 @@ cc_library( ], deps = [ ":tensorflow_lite", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:errors", - "//tensorflow/lite/schema:schema_fbs", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@local_xla//xla:statusor", ], ) @@ -1224,8 +1217,8 @@ cc_library( ":flatbuffer_export", ":flatbuffer_import", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/core:protos_all_cc", - "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/toco:toco_flags_proto_cc", "@com_google_absl//absl/strings", "@llvm-project//mlir:FuncDialect", @@ -1314,8 +1307,8 @@ tf_cc_binary( ":tf_to_tfl_flatbuffer", "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", - "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", @@ -1325,7 +1318,6 @@ tf_cc_binary( "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:errors", "//tensorflow/lite:framework", - "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", @@ -1391,6 +1383,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo", # buildcleaner: keep "//tensorflow/compiler/mlir/lite/stablehlo:transforms", "//tensorflow/compiler/mlir/lite/stablehlo:uniform_quantized_stablehlo_to_tfl_pass", + "//tensorflow/compiler/mlir/lite/stablehlo/odml_converter:outline_composites", "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", @@ -1423,6 +1416,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/metrics:error_collector", "//tensorflow/compiler/mlir/lite/metrics:error_collector_inst", "//tensorflow/compiler/mlir/lite/quantization/stablehlo:quantization", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_composite_to_tfl_custom", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_to_vhlo_pass", "//tensorflow/compiler/mlir/lite/stablehlo:op_stat_pass", @@ -1449,7 +1443,6 @@ cc_library( "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/experimental/remat:metadata_util", "//tensorflow/lite/python/metrics:converter_error_data_proto_cc", - "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/tools/optimize:quantize_weights", "//tensorflow/lite/tools/optimize:reduced_precision_support", @@ -1498,11 +1491,11 @@ cc_library( deps = [ ":convert_type", ":low_bit_utils", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:protos_all_cc", 
"//tensorflow/lite:string_util", - "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/base", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", diff --git a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h index 69ec0bbbcee3dc..1149d7841b38fd 100644 --- a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h +++ b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h @@ -101,7 +101,7 @@ struct PassConfig { bool enable_stablehlo_quantizer = false; // Enables the attempt to directly lower composites into tflite ops. - bool enable_composite_direct_lowering = false; + bool enable_composite_direct_lowering = true; }; inline llvm::raw_ostream& operator<<(llvm::raw_ostream& os, diff --git a/tensorflow/compiler/mlir/lite/emit_error_reporter.h b/tensorflow/compiler/mlir/lite/emit_error_reporter.h deleted file mode 100644 index 9e9a5925600fc2..00000000000000 --- a/tensorflow/compiler/mlir/lite/emit_error_reporter.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EMIT_ERROR_REPORTER_H_ -#define TENSORFLOW_COMPILER_MLIR_LITE_EMIT_ERROR_REPORTER_H_ - -#include - -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "tensorflow/lite/core/api/error_reporter.h" - -namespace tflite { - -// Error reporter that reports errors via the module's emitError. 
-class EmitErrorReporter : public ErrorReporter {
- public:
-  explicit EmitErrorReporter(mlir::ModuleOp module) : module_(module) {}
-  int Report(const char* format, va_list args) override;
-
- private:
-  mlir::ModuleOp module_;
-};
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_COMPILER_MLIR_LITE_EMIT_ERROR_REPORTER_H_
diff --git a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc
index 5f77797b9aa8a7..59cc28f9fa0608 100644
--- a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc
@@ -44,7 +44,7 @@ namespace common {
 
 bool IsConstantOrNone(Operation* op) {
   return (op->getNumResults() == 1 &&
-          op->getResult(0).getType().isa<NoneType>()) ||
+          mlir::isa<NoneType>(op->getResult(0).getType())) ||
          matchPattern(op, m_Constant()) || isa<QConstOp>(op);
 }
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
index 248a55c7fe17e1..b7c6eb7055221e 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
@@ -42,6 +42,7 @@ cc_library(
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -88,6 +89,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:SideEffectInterfaces",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -282,7 +284,6 @@ cc_library(
     deps = [
         ":target_aware_conversion",
         "//tensorflow/compiler/mlir:tf_mlir_opt_main",
-        "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_pass_registration",
     ],
     alwayslink = 1,
 )
@@ -324,7 +325,6 @@ cc_library(
         "//tensorflow/compiler/mlir/lite/experimental/tac/utils",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:error_util",
-        "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_pass_registration",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/status",
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h b/tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h
index 40f4902e655bcd..88382e8cf6f27b 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/IR/OpDefinition.h"  // from @llvm-project
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/Interfaces/CallInterfaces.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h"
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/lite/utils/utils.h"
@@ -52,7 +53,7 @@ bool NotTFLQuantDequantizeOp(Operation* op);
 
 // Returns true if it is a shaped type of f32 elements.
 inline bool IsF32ShapedType(Type t) {
-  if (auto shaped_type = t.dyn_cast_or_null<ShapedType>()) {
+  if (auto shaped_type = mlir::dyn_cast_or_null<ShapedType>(t)) {
     return shaped_type.getElementType().isF32();
   }
   return false;
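The cast churn in outline_operations.cc and utils.h above reflects MLIR's deprecation of the member-function casts (`x.isa<T>()`, `x.dyn_cast_or_null<T>()`) in favor of the free functions re-exported through `mlir/Support/LLVM.h`, which is also why these hunks add `@llvm-project//mlir:Support` deps. A before/after sketch using only standard MLIR API, mirroring the IsF32ShapedType change above:

```cpp
#include "mlir/IR/BuiltinTypes.h"  // mlir::ShapedType
#include "mlir/IR/Types.h"         // mlir::Type
#include "mlir/Support/LLVM.h"     // brings llvm::isa/dyn_cast_or_null into mlir::

bool IsF32Shaped(mlir::Type t) {
  // Before (deprecated): t.dyn_cast_or_null<mlir::ShapedType>()
  // After: the free-function form, which reads the same for types,
  // attributes, and values alike.
  if (auto shaped = mlir::dyn_cast_or_null<mlir::ShapedType>(t))
    return shaped.getElementType().isF32();
  return false;
}
```

Separately, the long runs of FileCheck updates in the .mlir tests that follow are not about casts: they track MLIR's move of inherent op attributes into "properties", which the generic printer now emits as `<{...}>` ahead of the discardable attribute dictionary. That is why `"tfl.concatenation"(...) {axis = 3 : i32, ..., tac.device = "GPU"}` becomes `"tfl.concatenation"(...) <{axis = 3 : i32, ...}> {tac.device = "GPU"}` throughout.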
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.cc b/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.cc
index 9b2458571f0c34..11a1b31e5102de 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.cc
@@ -29,6 +29,7 @@
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Region.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h"
 #include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h"
 #include "tensorflow/compiler/mlir/lite/experimental/tac/runtime_metadata_generated.h"
@@ -82,8 +83,7 @@ std::optional<std::vector<float>> GetPerDeviceCosts(
   for (const auto& kv : hardware_map) {
     auto cost_attr = device_costs_attr.getNamed(kv.first);
     if (!cost_attr.has_value()) return std::nullopt;
-    float cost = cost_attr->getValue()
-                     .dyn_cast_or_null<mlir::FloatAttr>()
-                     .getValueAsDouble();
+    float cost = mlir::dyn_cast_or_null<mlir::FloatAttr>(cost_attr->getValue())
                     .getValueAsDouble();
     device_costs[kv.second] = cost;
   }
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir
index c9a3999dad0a68..5ee1a71e344933 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-gpu.mlir
@@ -11,7 +11,7 @@ func.func @pack(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<2x1xf32> {
 // CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"{{.*}}dense<[2, 1]> : tensor<2xi32>
 // CHECK: %[[VAL_5:.*]] = "tfl.reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<1xf32>, tensor<4xi32>) -> tensor<1x1x1x1xf32>
 // CHECK: %[[VAL_6:.*]] = "tfl.reshape"(%[[VAL_1]], %[[VAL_2]]) : (tensor<1xf32>, tensor<4xi32>) -> tensor<1x1x1x1xf32>
-// CHECK: %[[VAL_7:.*]] = "tfl.concatenation"(%[[VAL_5]], %[[VAL_6]]) {axis = 3 : i32, fused_activation_function = "NONE"} : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x2xf32>
+// CHECK: %[[VAL_7:.*]] = "tfl.concatenation"(%[[VAL_5]], %[[VAL_6]]) <{axis = 3 : i32, fused_activation_function = "NONE"}> : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x2xf32>
 // CHECK: %[[VAL_8:.*]] = "tfl.reshape"(%[[VAL_7]], %[[VAL_3]]) : (tensor<1x1x1x2xf32>, tensor<1xi32>) -> tensor<2xf32>
 // CHECK: %[[VAL_9:.*]] = "tfl.reshape"(%[[VAL_8]], %[[VAL_4]]) : (tensor<2xf32>, tensor<2xi32>) -> tensor<2x1xf32>
 // CHECK: return %[[VAL_9]] : tensor<2x1xf32>
@@ -124,8 +124,8 @@ func.func @sub(%arg0: tensor<1x384x384x3xf32>, %arg1: tensor<3xf32>) -> tensor<1x384x384x3xf32> {
 // CHECK: func @sub(%[[VAL_0:.*]]: tensor<1x384x384x3xf32>, %[[VAL_1:.*]]: tensor<3xf32>) -> tensor<1x384x384x3xf32> {
 // CHECK: %[[VAL_2:.*]] = arith.constant dense<-1.000000e+00> : tensor<f32>
-// CHECK: %[[VAL_3:.*]] = tfl.mul(%[[VAL_1]], %[[VAL_2]]) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<f32>) -> tensor<3xf32>
-// CHECK: %[[VAL_4:.*]] = tfl.add(%[[VAL_0]], %[[VAL_3]]) {fused_activation_function = "NONE"} : (tensor<1x384x384x3xf32>, tensor<3xf32>) -> tensor<1x384x384x3xf32>
+// CHECK: %[[VAL_3:.*]] = tfl.mul(%[[VAL_1]], %[[VAL_2]]) <{fused_activation_function = "NONE"}> : (tensor<3xf32>, tensor<f32>) -> tensor<3xf32>
+// CHECK: %[[VAL_4:.*]] = tfl.add(%[[VAL_0]], %[[VAL_3]]) <{fused_activation_function = "NONE"}> : (tensor<1x384x384x3xf32>, tensor<3xf32>) -> tensor<1x384x384x3xf32>
 // CHECK: return %[[VAL_4]] : tensor<1x384x384x3xf32>
 // CHECK: }
@@ -139,7 +139,7 @@ func.func @ensureBiasForConv2d(%arg0: tensor<128x32x32x3xf32>, %arg1: tensor<32x1x1x3xf32>) -> tensor<128x32x32x32xf32> {
 // CHECK: func @ensureBiasForConv2d(%[[VAL_0:.*]]: tensor<128x32x32x3xf32>, %[[VAL_1:.*]]: tensor<32x1x1x3xf32>) -> tensor<128x32x32x32xf32> {
 // CHECK: %[[VAL_2:.*]] = "tfl.pseudo_const"{{.*}}dense<0.000000e+00> : tensor<32xf32>
-// CHECK: %[[VAL_3:.*]] = "tfl.conv_2d"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<128x32x32x3xf32>, tensor<32x1x1x3xf32>, tensor<32xf32>) -> tensor<128x32x32x32xf32>
+// CHECK: %[[VAL_3:.*]] = "tfl.conv_2d"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<128x32x32x3xf32>, tensor<32x1x1x3xf32>, tensor<32xf32>) -> tensor<128x32x32x32xf32>
 // CHECK: return %[[VAL_3]] : tensor<128x32x32x32xf32>
 // CHECK: }
@@ -156,7 +156,7 @@ func.func @padSliceTo4D(%arg0: tensor<4x384x32xf32>) -> tensor<1x384x32xf32> {
 // CHECK-DAG: %[[VAL_1:.*]] = "tf.Const"() <{value = dense<0> : tensor<4xi32>}> : () -> tensor<4xi32>
 // CHECK-DAG: %[[VAL_2:.*]] = "tf.Const"() <{value = dense<[1, 1, 384, 32]> : tensor<4xi32>}> : () -> tensor<4xi32>
 // CHECK-DAG: %[[VAL_3:.*]] = "tfl.pseudo_const"{{.*}}dense<[1, 4, 384, 32]> : tensor<4xi32>
-// CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"() {value = dense<[1, 384, 32]> : tensor<3xi32>
+// CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 384, 32]> : tensor<3xi32>
 // CHECK: %[[VAL_5:.*]] = "tfl.reshape"(%[[VAL_0]], %[[VAL_3]]) : (tensor<4x384x32xf32>, tensor<4xi32>) -> tensor<1x4x384x32xf32>
 // CHECK: %[[VAL_6:.*]] = "tfl.slice"(%[[VAL_5]], %[[VAL_1]], %[[VAL_2]]) : (tensor<1x4x384x32xf32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x1x384x32xf32>
 // CHECK: %[[VAL_7:.*]] = "tfl.reshape"(%[[VAL_6]], %[[VAL_4]]) : (tensor<1x1x384x32xf32>, tensor<3xi32>) -> tensor<1x384x32xf32>
@@ -189,7 +189,7 @@ func.func @fullyConnectedToConv(%arg0: tensor<384x384xf32>, %arg1: tensor<512x384xf32>
 // CHECK-DAG: %[[VAL_5:.*]] = "tfl.pseudo_const"{{.*}}dense<[384, 512]> : tensor<2xi32>
 // CHECK: %[[VAL_6:.*]] = "tfl.reshape"(%[[VAL_0]], %[[VAL_3]]) : (tensor<384x384xf32>, tensor<4xi32>) -> tensor<1x1x384x384xf32>
 // CHECK: %[[VAL_7:.*]] = "tfl.reshape"(%[[VAL_1]], %[[VAL_4]]) : (tensor<512x384xf32>, tensor<4xi32>) -> tensor<512x1x1x384xf32>
-// CHECK: %[[VAL_8:.*]] = "tfl.conv_2d"(%[[VAL_6]], %[[VAL_7]], %[[VAL_2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x1x384x384xf32>, tensor<512x1x1x384xf32>, tensor<512xf32>) -> tensor<1x1x384x512xf32>
+// CHECK: %[[VAL_8:.*]] = "tfl.conv_2d"(%[[VAL_6]], %[[VAL_7]], %[[VAL_2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x1x384x384xf32>, tensor<512x1x1x384xf32>, tensor<512xf32>) -> tensor<1x1x384x512xf32>
 // CHECK: %[[VAL_9:.*]] = "tfl.reshape"(%[[VAL_8]], %[[VAL_5]]) : (tensor<1x1x384x512xf32>, tensor<2xi32>) -> tensor<384x512xf32>
 // CHECK: return %[[VAL_9]] : tensor<384x512xf32>
 // CHECK: }
@@ -208,7 +208,7 @@ func.func @padConcatTo4D(%arg0: tensor<384x384xf32>, %arg1: tensor<384x384xf32>,
 // CHECK: %[[VAL_7:.*]] = "tfl.reshape"(%[[VAL_1]], %[[VAL_4]]) : (tensor<384x384xf32>, tensor<4xi32>) -> tensor<1x1x384x384xf32>
 // CHECK: %[[VAL_8:.*]] = "tfl.reshape"(%[[VAL_2]], %[[VAL_4]]) : (tensor<384x384xf32>, tensor<4xi32>) -> tensor<1x1x384x384xf32>
 // CHECK: %[[VAL_9:.*]] = "tfl.reshape"(%[[VAL_3]], %[[VAL_4]]) : (tensor<384x384xf32>, tensor<4xi32>) -> tensor<1x1x384x384xf32>
-// CHECK: %[[VAL_10:.*]] = "tfl.concatenation"(%[[VAL_6]], %[[VAL_7]], %[[VAL_8]], %[[VAL_9]]) {axis = 2 : i32, fused_activation_function = "NONE"} : (tensor<1x1x384x384xf32>, tensor<1x1x384x384xf32>, tensor<1x1x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x1x1536x384xf32>
+// CHECK: %[[VAL_10:.*]] = "tfl.concatenation"(%[[VAL_6]], %[[VAL_7]], %[[VAL_8]], %[[VAL_9]]) <{axis = 2 : i32, fused_activation_function = "NONE"}> : (tensor<1x1x384x384xf32>, tensor<1x1x384x384xf32>, tensor<1x1x384x384xf32>, tensor<1x1x384x384xf32>) -> tensor<1x1x1536x384xf32>
 // CHECK: %[[VAL_11:.*]] = "tfl.reshape"(%[[VAL_10]], %[[VAL_5]]) : (tensor<1x1x1536x384xf32>, tensor<2xi32>) -> tensor<1536x384xf32>
 // CHECK: return %[[VAL_11]] : tensor<1536x384xf32>
 // CHECK: }
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-nnapi.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-nnapi.mlir
index 8918291711354e..41e57494486187 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-nnapi.mlir
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/device-transform-nnapi.mlir
@@ -7,7 +7,7 @@ func.func @mean_4d_keepdim(%arg0: tensor<1x48x48x512xf32>) -> tensor<1x1x1x512xf32> {
 }
 // CHECK: func @mean_4d_keepdim([[VAL_0:%.*]]: tensor<1x48x48x512xf32>) -> tensor<1x1x1x512xf32> {
-// CHECK: [[VAL_1:%.*]] = "tfl.average_pool_2d"([[VAL_0]]) {filter_height = 48 : i32, filter_width = 48 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x48x48x512xf32>) -> tensor<1x1x1x512xf32>
+// CHECK: [[VAL_1:%.*]] = "tfl.average_pool_2d"([[VAL_0]]) <{filter_height = 48 : i32, filter_width = 48 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x48x48x512xf32>) -> tensor<1x1x1x512xf32>
 // CHECK: return [[VAL_1]] : tensor<1x1x1x512xf32>
 // CHECK: }
@@ -21,7 +21,7 @@ func.func @mean_4d_no_keepdim(%arg0: tensor<1x48x48x512xf32>) -> tensor<1x512xf32> {
 }
 // CHECK: func @mean_4d_no_keepdim([[VAL_0:%.*]]: tensor<1x48x48x512xf32>) -> tensor<1x512xf32> {
 // CHECK: [[VAL_1:%.*]] = "tfl.pseudo_const"(){{.*}}dense<[1, 512]> : tensor<2xi32>
-// CHECK: [[VAL_2:%.*]] = "tfl.average_pool_2d"([[VAL_0]]) {filter_height = 48 : i32, filter_width = 48 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x48x48x512xf32>) -> tensor<1x1x1x512xf32>
+// CHECK: [[VAL_2:%.*]] = "tfl.average_pool_2d"([[VAL_0]]) <{filter_height = 48 : i32, filter_width = 48 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x48x48x512xf32>) -> tensor<1x1x1x512xf32>
 // CHECK: [[VAL_3:%.*]] = "tfl.reshape"([[VAL_2]], [[VAL_1]]) : (tensor<1x1x1x512xf32>, tensor<2xi32>) -> tensor<1x512xf32>
 // CHECK: return [[VAL_3]] :
tensor<1x512xf32> // CHECK: } @@ -36,7 +36,7 @@ func.func @mean_quant_same_scale(%arg0: tensor>) -> tensor> { // CHECK: %[[VAL_1:.*]] = "tfl.pseudo_const"(){{.*}}dense<[-1, 2048]> : tensor<2xi32> -// CHECK: %[[VAL_2:.*]] = "tfl.average_pool_2d"(%[[VAL_0]]) {filter_height = 7 : i32, filter_width = 7 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor>) -> tensor> +// CHECK: %[[VAL_2:.*]] = "tfl.average_pool_2d"(%[[VAL_0]]) <{filter_height = 7 : i32, filter_width = 7 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor>) -> tensor> // CHECK: %[[VAL_3:.*]] = "tfl.reshape"(%[[VAL_2]], %[[VAL_1]]) : (tensor>, tensor<2xi32>) -> tensor> // CHECK: return %[[VAL_3]] : tensor> // CHECK: } @@ -51,8 +51,8 @@ func.func @mean_quant_different_scales(%arg0: tensor>) -> tensor> { // CHECK: %[[VAL_1:.*]] = "tfl.pseudo_const"(){{.*}}dense<[-1, 2048]> : tensor<2xi32> -// CHECK: %[[VAL_2:.*]] = "tfl.average_pool_2d"(%[[VAL_0]]) {filter_height = 7 : i32, filter_width = 7 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor>) -> tensor> +// CHECK: %[[VAL_2:.*]] = "tfl.average_pool_2d"(%[[VAL_0]]) <{filter_height = 7 : i32, filter_width = 7 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor>) -> tensor> // CHECK: %[[VAL_3:.*]] = "tfl.reshape"(%[[VAL_2]], %[[VAL_1]]) : (tensor>, tensor<2xi32>) -> tensor> -// CHECK: %[[VAL_4:.*]] = "tfl.quantize"(%[[VAL_3]]) {qtype = tensor>} : (tensor>) -> tensor> +// CHECK: %[[VAL_4:.*]] = "tfl.quantize"(%[[VAL_3]]) <{qtype = tensor>}> : (tensor>) -> tensor> // CHECK: return %[[VAL_4]] : tensor> // CHECK: } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/device-transform-nnapi.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/device-transform-nnapi.mlir index a69fc368ebcda4..55adc81a2e5713 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/device-transform-nnapi.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/device-transform-nnapi.mlir @@ -14,7 +14,7 @@ module { %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<1xf32>, tensor<1xf32>) -> tensor<2x1xf32> func.return %0 : tensor<2x1xf32> // CHECK: %[[VAL_0:.*]] = arith.constant dense<[2, 1]> : tensor<2xi32> - // CHECK: %[[CONCAT:.*]] = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<2xf32> + // CHECK: %[[CONCAT:.*]] = "tfl.concatenation"(%arg0, %arg1) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1xf32>, tensor<1xf32>) -> tensor<2xf32> // CHECK: %[[VAL_1:.*]] = "tfl.reshape"(%[[CONCAT]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xi32>) -> tensor<2x1xf32> // CHECK: return %[[VAL_1]] } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/simple-graph.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/simple-graph.mlir index b92d6c7a6f0103..a8c5a5f2ff1ee2 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/simple-graph.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/simple-graph.mlir @@ -12,7 +12,7 @@ func.func @main(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>, %arg2: tensor<1xf32> // CHECK: %[[CST:.*]] = arith.constant dense<1> : tensor<4xi32> // CHECK: [[VAL_0:%.*]] = "tfl.reshape"(%1, %[[CST]]) {tac.device = "GPU", 
tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<4xi32>) -> tensor<1x1x1x1xf32> // CHECK: [[VAL_1:%.*]] = "tfl.reshape"(%2, %[[CST]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<4xi32>) -> tensor<1x1x1x1xf32> -// CHECK: [[VAL_2:%.*]] = "tfl.concatenation"([[VAL_0]], [[VAL_1]]) {axis = 3 : i32, fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x2xf32> +// CHECK: [[VAL_2:%.*]] = "tfl.concatenation"([[VAL_0]], [[VAL_1]]) <{axis = 3 : i32, fused_activation_function = "NONE"}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x2xf32> // CHECK: [[VAL_3:%.*]] = "tfl.reshape"([[VAL_2]], %{{.*}}) : (tensor<1x1x1x2xf32>, tensor<2xi32>) -> tensor<2x1xf32> } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/fold-constants-to-subgraph.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/fold-constants-to-subgraph.mlir index e8a30755a8c768..12a3b14e5f894f 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/fold-constants-to-subgraph.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/fold-constants-to-subgraph.mlir @@ -20,8 +20,8 @@ func.func @simple_test(%arg0: tensor<4x384x32xf32>, %arg1: tensor<3xi32>, %arg2: } // PARTIAL: func @simple_test(%[[VAL_0:.*]]: tensor<4x384x32xf32>, %[[VAL_1:.*]]: tensor<3xi32>, %[[VAL_2:.*]]: tensor<3xi32>) -> tensor<1x384x32xf32> attributes {tac.interface_name = "func1"} { -// PARTIAL: %[[VAL_3:.*]] = "tfl.pseudo_const"() {value = dense<[1, 384, 32]> : tensor<3xi32>} : () -> tensor<3xi32> -// PARTIAL: %[[VAL_4:.*]] = "tfl.pseudo_const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// PARTIAL: %[[VAL_3:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 384, 32]> : tensor<3xi32>}> : () -> tensor<3xi32> +// PARTIAL: %[[VAL_4:.*]] = "tfl.pseudo_const"() <{value = dense<0> : tensor<3xi32>}> : () -> tensor<3xi32> // PARTIAL: %[[VAL_5:.*]] = "tfl.slice"(%[[VAL_0]], %[[VAL_4]], %[[VAL_3]]) : (tensor<4x384x32xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x384x32xf32> // PARTIAL: return %[[VAL_5]] : tensor<1x384x32xf32> // PARTIAL: } @@ -52,15 +52,15 @@ func.func @arg_reuse_test_2(%arg0: tensor<4x384x32xf32>, %arg1: tensor<3xi32>, % } // PARTIAL: func @arg_reuse_test_1(%[[VAL_0:.*]]: tensor<4x384x32xf32>, %[[VAL_1:.*]]: tensor<3xi32>, %[[VAL_2:.*]]: tensor<3xi32>) -> tensor<1x384x32xf32> attributes {tac.interface_name = "func1"} { -// PARTIAL: %[[VAL_3:.*]] = "tfl.pseudo_const"() {value = dense<[1, 384, 32]> : tensor<3xi32>} : () -> tensor<3xi32> -// PARTIAL: %[[VAL_4:.*]] = "tfl.pseudo_const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// PARTIAL: %[[VAL_3:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 384, 32]> : tensor<3xi32>}> : () -> tensor<3xi32> +// PARTIAL: %[[VAL_4:.*]] = "tfl.pseudo_const"() <{value = dense<0> : tensor<3xi32>}> : () -> tensor<3xi32> // PARTIAL: %[[VAL_5:.*]] = "tfl.slice"(%[[VAL_0]], %[[VAL_4]], %[[VAL_3]]) : (tensor<4x384x32xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x384x32xf32> // PARTIAL: return %[[VAL_5]] : tensor<1x384x32xf32> // PARTIAL: } // PARTIAL: func @arg_reuse_test_2(%[[VAL_6:.*]]: tensor<4x384x32xf32>, %[[VAL_7:.*]]: tensor<3xi32>, %[[VAL_8:.*]]: tensor<3xi32>) -> tensor<1x384x32xf32> attributes {tac.interface_name = "func2"} { -// PARTIAL: %[[VAL_9:.*]] = "tfl.pseudo_const"() {value = dense<[1, 384, 32]> : tensor<3xi32>} : () -> tensor<3xi32> -// PARTIAL: 
%[[VAL_10:.*]] = "tfl.pseudo_const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// PARTIAL: %[[VAL_9:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 384, 32]> : tensor<3xi32>}> : () -> tensor<3xi32> +// PARTIAL: %[[VAL_10:.*]] = "tfl.pseudo_const"() <{value = dense<0> : tensor<3xi32>}> : () -> tensor<3xi32> // PARTIAL: %[[VAL_11:.*]] = "tfl.slice"(%[[VAL_6]], %[[VAL_10]], %[[VAL_9]]) : (tensor<4x384x32xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x384x32xf32> // PARTIAL: return %[[VAL_11]] : tensor<1x384x32xf32> // PARTIAL: } @@ -84,8 +84,8 @@ func.func @quantization_test(%arg0: tensor<384x512x!quant.uniform>, } // PARTIAL: func @quantization_test(%[[VAL_0:.*]]: tensor<384x512x!quant.uniform>, %[[VAL_1:.*]]: tensor<128x512x!quant.uniform:f32, 1.000000e-02>>, %[[VAL_2:.*]]: tensor<128x!quant.uniform>) -> tensor<384x128x!quant.uniform> { -// PARTIAL: %[[VAL_3:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<128x!quant.uniform>, value = dense<0> : tensor<128xi32>} : () -> tensor<128x!quant.uniform> -// PARTIAL: %[[VAL_4:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[VAL_1]], %[[VAL_3]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.000000e-02>>, tensor<128x!quant.uniform>) -> tensor<384x128x!quant.uniform> +// PARTIAL: %[[VAL_3:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<128x!quant.uniform>, value = dense<0> : tensor<128xi32>}> : () -> tensor<128x!quant.uniform> +// PARTIAL: %[[VAL_4:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[VAL_1]], %[[VAL_3]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.000000e-02>>, tensor<128x!quant.uniform>) -> tensor<384x128x!quant.uniform> // PARTIAL: return %[[VAL_4]] : tensor<384x128x!quant.uniform> // PARTIAL: } @@ -108,9 +108,9 @@ func.func @fold_all_test(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3x } // ALL: func @fold_all_test(%[[VAL_0:.*]]: tensor<256x32x32x3xf32>, %[[VAL_1:.*]]: tensor<16x3x3x3xf32>, %[[VAL_2:.*]]: tensor<16xf32>) -> tensor<256x30x30x16xf32> { -// ALL: %[[VAL_3:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16xf32>} : () -> tensor<16xf32> -// ALL: %[[VAL_4:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x3x3x3xf32>} : () -> tensor<16x3x3x3xf32> -// ALL: %[[VAL_5:.*]] = "tfl.conv_2d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_3]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32, tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> +// ALL: %[[VAL_3:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<16xf32>}> : () -> tensor<16xf32> +// ALL: %[[VAL_4:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<16x3x3x3xf32>}> : () -> tensor<16x3x3x3xf32> +// ALL: %[[VAL_5:.*]] = "tfl.conv_2d"(%[[VAL_0]], %[[VAL_4]], %[[VAL_3]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> // ALL: return %[[VAL_5]] : tensor<256x30x30x16xf32> // ALL: } } diff --git 
a/tensorflow/compiler/mlir/lite/experimental/tac/tests/get-alternative-subgraph.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/get-alternative-subgraph.mlir index 0e5101a5352b4c..80eecdeea1e44f 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/get-alternative-subgraph.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/get-alternative-subgraph.mlir @@ -32,7 +32,7 @@ module { // CHECK: } // CHECK: func private @func_2_CPU_FLOAT(%[[VAL_0:.*]]: tensor<1xf32>, %[[VAL_1:.*]]: tensor<1xf32>) -> tensor<2x1xf32> attributes {tac.device = "CPU", tac.inference_type = "FLOAT", tac.interface_name = "func_2"} { -// CHECK: %[[VAL_2:.*]] = "tfl.pack"(%[[VAL_0]], %[[VAL_1]]) {axis = 0 : i32, tac.device = "CPU", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor<1xf32>, tensor<1xf32>) -> tensor<2x1xf32> +// CHECK: %[[VAL_2:.*]] = "tfl.pack"(%[[VAL_0]], %[[VAL_1]]) <{axis = 0 : i32, values_count = 2 : i32}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<2x1xf32> // CHECK: return %[[VAL_2]] : tensor<2x1xf32> // CHECK: } @@ -53,7 +53,7 @@ module { // CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"(){{.*}}dense<[2, 1]> : tensor<2xi32> // CHECK: %[[VAL_5:.*]] = "tfl.reshape"(%[[VAL_0]], %[[VAL_2]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<4xi32>) -> tensor<1x1x1x1xf32> // CHECK: %[[VAL_6:.*]] = "tfl.reshape"(%[[VAL_1]], %[[VAL_2]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<4xi32>) -> tensor<1x1x1x1xf32> -// CHECK: %[[VAL_7:.*]] = "tfl.concatenation"(%[[VAL_5]], %[[VAL_6]]) {axis = 3 : i32, fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x2xf32> +// CHECK: %[[VAL_7:.*]] = "tfl.concatenation"(%[[VAL_5]], %[[VAL_6]]) <{axis = 3 : i32, fused_activation_function = "NONE"}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x2xf32> // CHECK: %[[VAL_8:.*]] = "tfl.reshape"(%[[VAL_7]], %[[VAL_3]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x1x2xf32>, tensor<1xi32>) -> tensor<2xf32> // CHECK: %[[VAL_9:.*]] = "tfl.reshape"(%[[VAL_8]], %[[VAL_4]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<2xf32>, tensor<2xi32>) -> tensor<2x1xf32> // CHECK: return %[[VAL_9]] : tensor<2x1xf32> @@ -81,7 +81,7 @@ func.func private @func_10_CPU_FLOAT(%arg0: tensor<3xi32>, %arg1: tensor, % } // CHECK: func private @func_10_CPU_FLOAT(%[[VAL_0:.*]]: tensor<3xi32>, %[[VAL_1:.*]]: tensor, %[[VAL_2:.*]]: tensor, %[[VAL_3:.*]]: tensor) -> tensor<*xf32> attributes {tac.device = "CPU", tac.inference_type = "FLOAT", tac.interface_name = "func_10"} { -// CHECK: %[[VAL_4:.*]] = "tfl.one_hot"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {axis = -1 : i32, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> +// CHECK: %[[VAL_4:.*]] = "tfl.one_hot"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) <{axis = -1 : i32}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> // CHECK: return %[[VAL_4]] : tensor<*xf32> // CHECK: } @@ -121,11 +121,11 @@ func.func private @quantize_ops_CPU_QUANTIZED_INT8(%arg0: tensor<384x512x!quant. 
// CHECK: func private @quantize_ops_CPU_QUANTIZED_INT8(%[[VAL_0:.*]]: tensor<384x512x!quant.uniform>, %[[VAL_1:.*]]: tensor<128x512x!quant.uniform:f32, 1.000000e-01>>, %[[VAL_2:.*]]: tensor<128x!quant.uniform>, %[[VAL_3:.*]]: tensor<128x!quant.uniform>) -> tensor<1x384x128x!quant.uniform> attributes {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", tac.interface_name = "quantize_ops"} { // CHECK-DAG: %[[VAL_4:.*]] = arith.constant dense<[1, 384, 128]> : tensor<3xi32> -// CHECK-DAG: %[[VAL_5:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<128x!quant.uniform>, value = dense<0> : tensor<128xi32>} : () -> tensor<128x!quant.uniform> -// CHECK: %[[VAL_6:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[VAL_1]], %[[VAL_5]]) {fused_activation_function = "NONE", keep_num_dims = false, tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", weights_format = "DEFAULT"} : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.000000e-01>>, tensor<128x!quant.uniform>) -> tensor<384x128x!quant.uniform> +// CHECK-DAG: %[[VAL_5:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<128x!quant.uniform>, value = dense<0> : tensor<128xi32>}> : () -> tensor<128x!quant.uniform> +// CHECK: %[[VAL_6:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[VAL_1]], %[[VAL_5]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.000000e-01>>, tensor<128x!quant.uniform>) -> tensor<384x128x!quant.uniform> // CHECK: %[[VAL_7:.*]] = "tfl.reshape"(%[[VAL_6]], %[[VAL_4]]) {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<384x128x!quant.uniform>, tensor<3xi32>) -> tensor<1x384x128x!quant.uniform> -// CHECK: %[[VAL_8:.*]] = tfl.mul(%[[VAL_7]], %[[VAL_2]]) {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x384x128x!quant.uniform>, tensor<128x!quant.uniform>) -> tensor<1x384x128x!quant.uniform> -// CHECK: %[[VAL_9:.*]] = tfl.add(%[[VAL_8]], %[[VAL_3]]) {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x384x128x!quant.uniform>, tensor<128x!quant.uniform>) -> tensor<1x384x128x!quant.uniform> +// CHECK: %[[VAL_8:.*]] = tfl.mul(%[[VAL_7]], %[[VAL_2]]) <{fused_activation_function = "NONE"}> {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x384x128x!quant.uniform>, tensor<128x!quant.uniform>) -> tensor<1x384x128x!quant.uniform> +// CHECK: %[[VAL_9:.*]] = tfl.add(%[[VAL_8]], %[[VAL_3]]) <{fused_activation_function = "NONE"}> {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x384x128x!quant.uniform>, tensor<128x!quant.uniform>) -> tensor<1x384x128x!quant.uniform> // CHECK: return %[[VAL_9]] : tensor<1x384x128x!quant.uniform> // CHECK: } @@ -139,14 +139,14 @@ func.func private @quantize_ops_CPU_QUANTIZED_INT8(%arg0: tensor<384x512x!quant. 
// CHECK: %[[VAL_10:.*]] = "tfl.dequantize"(%[[VAL_1]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<128x512x!quant.uniform:f32, 1.000000e-01>>) -> tensor<128x512xf32> // CHECK: %[[VAL_11:.*]] = "tfl.reshape"(%[[VAL_9]], %[[VAL_6]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<384x512xf32>, tensor<4xi32>) -> tensor<1x1x384x512xf32> // CHECK: %[[VAL_12:.*]] = "tfl.reshape"(%[[VAL_10]], %[[VAL_7]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<128x512xf32>, tensor<4xi32>) -> tensor<128x1x1x512xf32> -// CHECK: %[[VAL_13:.*]] = "tfl.conv_2d"(%[[VAL_11]], %[[VAL_12]], %[[VAL_4]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32, tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x384x512xf32>, tensor<128x1x1x512xf32>, tensor<128xf32>) -> tensor<1x1x384x128xf32> +// CHECK: %[[VAL_13:.*]] = "tfl.conv_2d"(%[[VAL_11]], %[[VAL_12]], %[[VAL_4]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x384x512xf32>, tensor<128x1x1x512xf32>, tensor<128xf32>) -> tensor<1x1x384x128xf32> // CHECK: %[[VAL_14:.*]] = "tfl.reshape"(%[[VAL_13]], %[[VAL_8]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x1x384x128xf32>, tensor<2xi32>) -> tensor<384x128xf32> // CHECK: %[[VAL_15:.*]] = "tfl.reshape"(%[[VAL_14]], %[[VAL_5]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<384x128xf32>, tensor<3xi32>) -> tensor<1x384x128xf32> // CHECK: %[[VAL_16:.*]] = "tfl.dequantize"(%[[VAL_2]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<128x!quant.uniform>) -> tensor<128xf32> -// CHECK: %[[VAL_17:.*]] = tfl.mul(%[[VAL_15]], %[[VAL_16]]) {fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> +// CHECK: %[[VAL_17:.*]] = tfl.mul(%[[VAL_15]], %[[VAL_16]]) <{fused_activation_function = "NONE"}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> // CHECK: %[[VAL_18:.*]] = "tfl.dequantize"(%[[VAL_3]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<128x!quant.uniform>) -> tensor<128xf32> -// CHECK: %[[VAL_19:.*]] = tfl.add(%[[VAL_17]], %[[VAL_18]]) {fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> -// CHECK: %[[VAL_20:.*]] = "tfl.quantize"(%[[VAL_19]]) {qtype = tensor<1x384x128x!quant.uniform>, tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>) -> tensor<1x384x128x!quant.uniform> +// CHECK: %[[VAL_19:.*]] = tfl.add(%[[VAL_17]], %[[VAL_18]]) <{fused_activation_function = "NONE"}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> +// CHECK: %[[VAL_20:.*]] = "tfl.quantize"(%[[VAL_19]]) <{qtype = tensor<1x384x128x!quant.uniform>}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>) -> tensor<1x384x128x!quant.uniform> // CHECK: return %[[VAL_20]] : tensor<1x384x128x!quant.uniform> // CHECK: } @@ -155,13 +155,13 @@ func.func private @quantize_ops_CPU_QUANTIZED_INT8(%arg0: tensor<384x512x!quant. 
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant dense<[1, 384, 128]> : tensor<3xi32> // CHECK: %[[VAL_6:.*]] = "tfl.dequantize"(%[[VAL_0]]) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<384x512x!quant.uniform>) -> tensor<384x512xf32> // CHECK: %[[VAL_7:.*]] = "tfl.dequantize"(%[[VAL_1]]) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<128x512x!quant.uniform:f32, 1.000000e-01>>) -> tensor<128x512xf32> -// CHECK: %[[VAL_8:.*]] = "tfl.fully_connected"(%[[VAL_6]], %[[VAL_7]], %[[VAL_4]]) {fused_activation_function = "NONE", keep_num_dims = false, tac.device = "CPU", tac.inference_type = "FLOAT", weights_format = "DEFAULT"} : (tensor<384x512xf32>, tensor<128x512xf32>, tensor<128xf32>) -> tensor<384x128xf32> +// CHECK: %[[VAL_8:.*]] = "tfl.fully_connected"(%[[VAL_6]], %[[VAL_7]], %[[VAL_4]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<384x512xf32>, tensor<128x512xf32>, tensor<128xf32>) -> tensor<384x128xf32> // CHECK: %[[VAL_9:.*]] = "tfl.reshape"(%[[VAL_8]], %[[VAL_5]]) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<384x128xf32>, tensor<3xi32>) -> tensor<1x384x128xf32> // CHECK: %[[VAL_10:.*]] = "tfl.dequantize"(%[[VAL_2]]) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<128x!quant.uniform>) -> tensor<128xf32> -// CHECK: %[[VAL_11:.*]] = tfl.mul(%[[VAL_9]], %[[VAL_10]]) {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> +// CHECK: %[[VAL_11:.*]] = tfl.mul(%[[VAL_9]], %[[VAL_10]]) <{fused_activation_function = "NONE"}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> // CHECK: %[[VAL_12:.*]] = "tfl.dequantize"(%[[VAL_3]]) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<128x!quant.uniform>) -> tensor<128xf32> -// CHECK: %[[VAL_13:.*]] = tfl.add(%[[VAL_11]], %[[VAL_12]]) {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> -// CHECK: %[[VAL_14:.*]] = "tfl.quantize"(%[[VAL_13]]) {qtype = tensor<1x384x128x!quant.uniform>, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>) -> tensor<1x384x128x!quant.uniform> +// CHECK: %[[VAL_13:.*]] = tfl.add(%[[VAL_11]], %[[VAL_12]]) <{fused_activation_function = "NONE"}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>, tensor<128xf32>) -> tensor<1x384x128xf32> +// CHECK: %[[VAL_14:.*]] = "tfl.quantize"(%[[VAL_13]]) <{qtype = tensor<1x384x128x!quant.uniform>}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1x384x128xf32>) -> tensor<1x384x128x!quant.uniform> // CHECK: return %[[VAL_14]] : tensor<1x384x128x!quant.uniform> // CHECK: } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/pick-subgraphs.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/pick-subgraphs.mlir index 0157e97a4e4ac3..b309fb513b1fe7 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/pick-subgraphs.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/pick-subgraphs.mlir @@ -93,12 +93,12 @@ module { } // CHECK: func @main([[VAL_0:%.*]]: tensor<1x200x200x200xf32>) -> tensor<2x1x200x200x200xf32> attributes {tf.entry_function = {inputs = "Placeholder", outputs = "mul_1"}} { -// CHECK: [[VAL_1:%.*]] = "tfl.pseudo_const"() {value = 
dense<0.962260901> : tensor<1xf32>} : () -> tensor<1xf32> +// CHECK: [[VAL_1:%.*]] = "tfl.pseudo_const"() <{value = dense<0.962260901> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: [[VAL_2:%.*]] = call @func_0_GPU_FLOAT([[VAL_0]], [[VAL_1]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor<1x200x200x200xf32>, tensor<1xf32>) -> tensor<1x200x200x200xf32> -// CHECK: [[VAL_3:%.*]] = "tfl.pseudo_const"() {value = dense<0.895973444> : tensor<1xf32>} : () -> tensor<1xf32> +// CHECK: [[VAL_3:%.*]] = "tfl.pseudo_const"() <{value = dense<0.895973444> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: [[VAL_4:%.*]] = call @func_1_GPU_FLOAT([[VAL_0]], [[VAL_3]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_1"} : (tensor<1x200x200x200xf32>, tensor<1xf32>) -> tensor<1x200x200x200xf32> // CHECK: [[VAL_5:%.*]] = call @func_2_GPU_FLOAT([[VAL_4]], [[VAL_2]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_2"} : (tensor<1x200x200x200xf32>, tensor<1x200x200x200xf32>) -> tensor<2x1x200x200x200xf32> -// CHECK: [[VAL_6:%.*]] = "tfl.pseudo_const"() {value = dense<0.0778453499> : tensor<1xf32>} : () -> tensor<1xf32> +// CHECK: [[VAL_6:%.*]] = "tfl.pseudo_const"() <{value = dense<0.0778453499> : tensor<1xf32>}> : () -> tensor<1xf32> // CHECK: [[VAL_7:%.*]] = call @func_3_GPU_FLOAT([[VAL_5]], [[VAL_6]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_3"} : (tensor<2x1x200x200x200xf32>, tensor<1xf32>) -> tensor<2x1x200x200x200xf32> // CHECK: return [[VAL_7]] : tensor<2x1x200x200x200xf32> // CHECK: } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir index 3018221fdacff6..0934bb387a22c5 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir @@ -116,7 +116,7 @@ func.func @simpleTest(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>, %arg2: tensor< // CHECK: } // CHECK: func private @func_1_CPU_FLOAT(%[[VAL_0:.*]]: tensor<1xf32>, %[[VAL_1:.*]]: tensor<1xf32>) -> tensor<2x1xf32> attributes {tac.device = "CPU", tac.inference_type = "FLOAT", tac.interface_name = "func_1"} { -// CHECK: %[[VAL_2:.*]] = "tfl.pack"(%[[VAL_0]], %[[VAL_1]]) {axis = 0 : i32, tac.device = "CPU", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor<1xf32>, tensor<1xf32>) -> tensor<2x1xf32> +// CHECK: %[[VAL_2:.*]] = "tfl.pack"(%[[VAL_0]], %[[VAL_1]]) <{axis = 0 : i32, values_count = 2 : i32}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<2x1xf32> // CHECK: return %[[VAL_2]] : tensor<2x1xf32> // CHECK: } @@ -134,17 +134,17 @@ func.func @constWeight(%arg0: tensor<256x32x32x3xf32>) -> tensor<256x30x30x16xf3 } // CHECK: func @constWeight(%[[VAL_0:.*]]: tensor<256x32x32x3xf32>) -> tensor<256x30x30x16xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x3x3x3xf32>} : () -> tensor<16x3x3x3xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16xf32>} : () -> tensor<16xf32> -// CHECK-DAG: %[[VAL_3:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16x3x3x16xf32>} : () -> tensor<16x3x3x16xf32> -// CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16xf32>} : () -> tensor<16xf32> +// 
CHECK-DAG: %[[VAL_1:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<16x3x3x3xf32>}> : () -> tensor<16x3x3x3xf32> +// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<16xf32>}> : () -> tensor<16xf32> +// CHECK-DAG: %[[VAL_3:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<16x3x3x16xf32>}> : () -> tensor<16x3x3x16xf32> +// CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<16xf32>}> : () -> tensor<16xf32> // CHECK: %[[VAL_5:.*]] = call @func_0_GPU_FLOAT(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16x3x3x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> // CHECK: return %[[VAL_5]] : tensor<256x30x30x16xf32> // CHECK: } // CHECK: func private @func_0_GPU_FLOAT(%[[VAL_0:.*]]: tensor<256x32x32x3xf32>, %[[VAL_1:.*]]: tensor<16x3x3x3xf32>, %[[VAL_2:.*]]: tensor<16xf32>, %[[VAL_3:.*]]: tensor<16x3x3x16xf32>, %[[VAL_4:.*]]: tensor<16xf32>) -> tensor<256x30x30x16xf32> attributes {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { -// CHECK: %[[VAL_5:.*]] = "tfl.conv_2d"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32, tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> -// CHECK: %[[VAL_6:.*]] = "tfl.conv_2d"(%[[VAL_5]], %[[VAL_3]], %[[VAL_4]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32, tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<256x30x30x16xf32>, tensor<16x3x3x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> +// CHECK: %[[VAL_5:.*]] = "tfl.conv_2d"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> +// CHECK: %[[VAL_6:.*]] = "tfl.conv_2d"(%[[VAL_5]], %[[VAL_3]], %[[VAL_4]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<256x30x30x16xf32>, tensor<16x3x3x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> // CHECK: return %[[VAL_6]] : tensor<256x30x30x16xf32> // CHECK: } @@ -166,15 +166,15 @@ func.func @norm1(%arg0: tensor<1x128x128xf32>) -> tensor<1x128x128xf32> { } // CHECK: func @norm1(%[[VAL_0:.*]]: tensor<1x128x128xf32>) -> tensor<1x128x128xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<128xf32>} : () -> tensor<128xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() {value = dense<128> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK-DAG: %[[VAL_3:.*]] = "tfl.pseudo_const"() {value = dense<[1, 128, 128]> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK-DAG: %[[VAL_1:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<128xf32>}> : () -> tensor<128xf32> +// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() <{value = 
dense<128> : tensor<2xi32>}> : () -> tensor<2xi32> +// CHECK-DAG: %[[VAL_3:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 128, 128]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: %[[VAL_4:.*]] = call @func_0_GPU_FLOAT(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor<1x128x128xf32>, tensor<128xf32>, tensor<2xi32>, tensor<3xi32>) -> tensor<1x128x128xf32> // CHECK: return %[[VAL_4]] : tensor<1x128x128xf32> // CHECK: } // CHECK: func private @func_0_GPU_FLOAT(%[[VAL_0:.*]]: tensor<1x128x128xf32>, %[[VAL_1:.*]]: tensor<128xf32>, %[[VAL_2:.*]]: tensor<2xi32>, %[[VAL_3:.*]]: tensor<3xi32>) -> tensor<1x128x128xf32> attributes {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { -// CHECK: %[[VAL_4:.*]] = tfl.add(%[[VAL_0]], %[[VAL_1]]) {fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> +// CHECK: %[[VAL_4:.*]] = tfl.add(%[[VAL_0]], %[[VAL_1]]) <{fused_activation_function = "NONE"}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK: %[[VAL_5:.*]] = "tfl.reshape"(%[[VAL_4]], %[[VAL_2]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x128x128xf32>, tensor<2xi32>) -> tensor<128x128xf32> // CHECK: %[[VAL_6:.*]] = "tfl.relu"(%[[VAL_5]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<128x128xf32>) -> tensor<128x128xf32> // CHECK: %[[VAL_7:.*]] = "tfl.reshape"(%[[VAL_6]], %[[VAL_3]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<128x128xf32>, tensor<3xi32>) -> tensor<1x128x128xf32> @@ -204,19 +204,19 @@ func.func @norm2(%arg0: tensor<1x128x128xf32>) -> tensor<1x128x128xf32> { } // CHECK: func @norm2(%[[VAL_0:.*]]: tensor<1x128x128xf32>) -> tensor<1x128x128xf32> { -// CHECK-DAG: %[[VAL_1:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<128xf32>} : () -> tensor<128xf32> -// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() {value = dense<128> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[VAL_1:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<128xf32>}> : () -> tensor<128xf32> +// CHECK-DAG: %[[VAL_2:.*]] = "tfl.pseudo_const"() <{value = dense<128> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %[[VAL_3:.*]]:2 = call @func_0_GPU_FLOAT(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor<1x128x128xf32>, tensor<128xf32>, tensor<2xi32>) -> (tensor<1x128x128xf32>, tensor<128x128xf32>) -// CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<128x128xf32>} : () -> tensor<128x128xf32> -// CHECK-DAG: %[[VAL_5:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<128xf32>} : () -> tensor<128xf32> +// CHECK-DAG: %[[VAL_4:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<128x128xf32>}> : () -> tensor<128x128xf32> +// CHECK-DAG: %[[VAL_5:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<128xf32>}> : () -> tensor<128xf32> // CHECK: %[[VAL_6:.*]] = call @func_2_CPU_FLOAT(%[[VAL_3]]#1, %[[VAL_4]], %[[VAL_5]]) {tac.device = "CPU", tac.inference_type = "FLOAT", tac.interface_name = "func_2"} : (tensor<128x128xf32>, tensor<128x128xf32>, tensor<128xf32>) -> tensor<128x128xf32> -// CHECK: %[[VAL_7:.*]] = "tfl.pseudo_const"() {value = dense<[1, 128, 128]> : tensor<3xi32>} : () -> 
tensor<3xi32> +// CHECK: %[[VAL_7:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 128, 128]> : tensor<3xi32>}> : () -> tensor<3xi32> // CHECK: %[[VAL_8:.*]] = call @func_1_GPU_FLOAT(%[[VAL_6]], %[[VAL_7]], %[[VAL_3]]#0) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_1"} : (tensor<128x128xf32>, tensor<3xi32>, tensor<1x128x128xf32>) -> tensor<1x128x128xf32> // CHECK: return %[[VAL_8]] : tensor<1x128x128xf32> // CHECK: } // CHECK: func.func private @func_0_GPU_FLOAT(%[[VAL_0:.*]]: tensor<1x128x128xf32>, %[[VAL_1:.*]]: tensor<128xf32>, %[[VAL_2:.*]]: tensor<2xi32>) -> (tensor<1x128x128xf32>, tensor<128x128xf32>) attributes {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { -// CHECK: %[[VAL_3:.*]] = tfl.add(%[[VAL_0]], %[[VAL_1]]) {fused_activation_function = "NONE", tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> +// CHECK: %[[VAL_3:.*]] = tfl.add(%[[VAL_0]], %[[VAL_1]]) <{fused_activation_function = "NONE"}> {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK: %[[VAL_4:.*]] = "tfl.reshape"(%[[VAL_3]], %[[VAL_2]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<1x128x128xf32>, tensor<2xi32>) -> tensor<128x128xf32> // CHECK: %[[VAL_5:.*]] = "tfl.relu"(%[[VAL_4]]) {tac.device = "GPU", tac.inference_type = "FLOAT"} : (tensor<128x128xf32>) -> tensor<128x128xf32> // CHECK: return %[[VAL_3]], %[[VAL_5]] : tensor<1x128x128xf32>, tensor<128x128xf32> @@ -229,7 +229,7 @@ func.func @norm2(%arg0: tensor<1x128x128xf32>) -> tensor<1x128x128xf32> { // CHECK: } // CHECK: func.func private @func_2_CPU_FLOAT(%[[VAL_0:.*]]: tensor<128x128xf32>, %[[VAL_1:.*]]: tensor<128x128xf32>, %[[VAL_2:.*]]: tensor<128xf32>) -> tensor<128x128xf32> attributes {tac.device = "CPU", tac.inference_type = "FLOAT", tac.interface_name = "func_2"} { -// CHECK: %[[VAL_3:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {fused_activation_function = "NONE", keep_num_dims = false, tac.device = "CPU", tac.inference_type = "FLOAT", weights_format = "DEFAULT"} : (tensor<128x128xf32>, tensor<128x128xf32>, tensor<128xf32>) -> tensor<128x128xf32> +// CHECK: %[[VAL_3:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<128x128xf32>, tensor<128x128xf32>, tensor<128xf32>) -> tensor<128x128xf32> // CHECK: return %[[VAL_3]] : tensor<128x128xf32> // CHECK: } @@ -248,8 +248,8 @@ func.func @quantizedOpOnly(%arg0: tensor<1x!quant.uniform>, } // CHECK: func @quantizedOpOnly(%[[VAL_0:.*]]: tensor<1x!quant.uniform>, %[[VAL_1:.*]]: tensor<1x!quant.uniform>) -> tensor<2x1x!quant.uniform> { -// CHECK: %[[VAL_2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x!quant.uniform>, value = dense<127> : tensor<1xi8>} : () -> tensor<1x!quant.uniform> -// CHECK: %[[VAL_3:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x!quant.uniform>, value = dense<127> : tensor<1xi8>} : () -> tensor<1x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x!quant.uniform>, value = dense<127> : tensor<1xi8>}> : () -> tensor<1x!quant.uniform> +// CHECK: %[[VAL_3:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x!quant.uniform>, value = dense<127> : tensor<1xi8>}> : () -> tensor<1x!quant.uniform> // CHECK: %[[VAL_4:.*]] = call @func_0_CPU_QUANTIZED_INT8(%[[VAL_0]], 
%[[VAL_2]], %[[VAL_3]], %[[VAL_1]]) {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", tac.interface_name = "func_0"} : (tensor<1x!quant.uniform>, tensor<1x!quant.uniform>, tensor<1x!quant.uniform>, tensor<1x!quant.uniform>) -> tensor<2x1x!quant.uniform> // CHECK: return %[[VAL_4]] : tensor<2x1x!quant.uniform> // CHECK: } @@ -258,7 +258,7 @@ func.func @quantizedOpOnly(%arg0: tensor<1x!quant.uniform>, // CHECK: %[[VAL_5:.*]] = tfl.mul %[[VAL_0]], %[[VAL_1]] {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : tensor<1x!quant.uniform> // CHECK: %[[VAL_6:.*]] = tfl.add %[[VAL_5]], %[[VAL_2]] {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : tensor<1x!quant.uniform> // CHECK: %[[VAL_7:.*]] = tfl.add %[[VAL_3]], %[[VAL_1]] {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : tensor<1x!quant.uniform> -// CHECK: %[[VAL_8:.*]] = "tfl.pack"(%[[VAL_6]], %[[VAL_7]]) {axis = 0 : i32, tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", values_count = 2 : i32} : (tensor<1x!quant.uniform>, tensor<1x!quant.uniform>) -> tensor<2x1x!quant.uniform> +// CHECK: %[[VAL_8:.*]] = "tfl.pack"(%[[VAL_6]], %[[VAL_7]]) <{axis = 0 : i32, values_count = 2 : i32}> {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x!quant.uniform>, tensor<1x!quant.uniform>) -> tensor<2x1x!quant.uniform> // CHECK: return %[[VAL_8]] : tensor<2x1x!quant.uniform> // CHECK: } @@ -280,12 +280,12 @@ func.func @quantizationWithFloat(%arg0: tensor<1x1x384x!quant.uniform>, %[[VAL_1:.*]]: tensor<1x1x384x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> { -// CHECK: %[[VAL_2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x384x1x!quant.uniform>, value = dense<127> : tensor<1x384x1xi8>} : () -> tensor<1x384x1x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x384x1x!quant.uniform>, value = dense<127> : tensor<1x384x1xi8>}> : () -> tensor<1x384x1x!quant.uniform> // CHECK: %[[VAL_3:.*]] = call @func_1_CPU_QUANTIZED_INT8(%[[VAL_0]], %[[VAL_2]]) {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", tac.interface_name = "func_1"} : (tensor<1x1x384x!quant.uniform>, tensor<1x384x1x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> // CHECK: %[[VAL_4:.*]] = "tfl.dequantize"(%[[VAL_3]]) : (tensor<1x384x384x!quant.uniform>) -> tensor<1x384x384xf32> -// CHECK: %[[VAL_5:.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<1x1x384xf32>} : () -> tensor<1x384x384xf32> +// CHECK: %[[VAL_5:.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<1x1x384xf32>}> : () -> tensor<1x384x384xf32> // CHECK: %[[VAL_6:.*]] = call @func_0_GPU_FLOAT(%[[VAL_4]], %[[VAL_5]]) {tac.device = "GPU", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} : (tensor<1x384x384xf32>, tensor<1x384x384xf32>) -> tensor<1x384x384xf32> -// CHECK: %[[VAL_7:.*]] = "tfl.quantize"(%[[VAL_6]]) {qtype = tensor<1x384x1x!quant.uniform>} : (tensor<1x384x384xf32>) -> tensor<1x384x384x!quant.uniform> +// CHECK: %[[VAL_7:.*]] = "tfl.quantize"(%[[VAL_6]]) <{qtype = tensor<1x384x1x!quant.uniform>}> : (tensor<1x384x384xf32>) -> tensor<1x384x384x!quant.uniform> // CHECK: %[[VAL_8:.*]] = call @func_2_CPU_QUANTIZED_INT8(%[[VAL_1]], %[[VAL_7]]) {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", tac.interface_name = "func_2"} : (tensor<1x1x384x!quant.uniform>, tensor<1x384x384x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> // CHECK: return %[[VAL_8]] : 
tensor<1x384x384x!quant.uniform> // CHECK: } @@ -296,12 +296,12 @@ func.func @quantizationWithFloat(%arg0: tensor<1x1x384x!quant.uniform>, %[[VAL_1:.*]]: tensor<1x384x1x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> attributes {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", tac.interface_name = "func_1"} { -// CHECK: %[[VAL_2:.*]] = tfl.mul(%[[VAL_0]], %[[VAL_1]]) {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x1x384x!quant.uniform>, tensor<1x384x1x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = tfl.mul(%[[VAL_0]], %[[VAL_1]]) <{fused_activation_function = "NONE"}> {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x1x384x!quant.uniform>, tensor<1x384x1x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> // CHECK: return %[[VAL_2]] : tensor<1x384x384x!quant.uniform> // CHECK: } // CHECK: func private @func_2_CPU_QUANTIZED_INT8(%[[VAL_0:.*]]: tensor<1x1x384x!quant.uniform>, %[[VAL_1:.*]]: tensor<1x384x384x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> attributes {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8", tac.interface_name = "func_2"} { -// CHECK: %[[VAL_2:.*]] = tfl.mul(%[[VAL_0]], %[[VAL_1]]) {fused_activation_function = "NONE", tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x1x384x!quant.uniform>, tensor<1x384x384x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> +// CHECK: %[[VAL_2:.*]] = tfl.mul(%[[VAL_0]], %[[VAL_1]]) <{fused_activation_function = "NONE"}> {tac.device = "CPU", tac.inference_type = "QUANTIZED_INT8"} : (tensor<1x1x384x!quant.uniform>, tensor<1x384x384x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> // CHECK: return %[[VAL_2]] : tensor<1x384x384x!quant.uniform> // CHECK: } @@ -360,28 +360,28 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor, tensor) -> tensor %31 = tfl.add %30, %25 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor %32 = "tfl.reshape"(%31, %cst_8) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor - %33 = "tfl.gather"(%28, %32) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor + %33 = "tfl.gather"(%28, %32) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %34 = "tfl.reshape"(%33, %cst_8) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor %35 = "tfl.shape"(%34) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<1xi32> %36 = "tfl.fill"(%35, %cst_7) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor %37 = "tfl.expand_dims"(%arg10, %cst_9) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %38 = tfl.add %37, %15 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor %39 = "tfl.reshape"(%38, %cst_8) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor - %40 = "tfl.gather"(%18, %39) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor + %40 = "tfl.gather"(%18, %39) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %41 = "tfl.reshape"(%40, %cst_8) {tac.device = "DARWINN", 
tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor %42 = "tfl.shape"(%41) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<1xi32> %43 = "tfl.fill"(%42, %cst_7) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor - %44 = "tfl.gather"(%arg2, %arg8) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor + %44 = "tfl.gather"(%arg2, %arg8) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %45 = "tfl.equal"(%44, %cst_6) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %46 = "tfl.custom"(%45, %36, %34) {custom_code = "FlexSelect", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor %47 = "tfl.custom"(%arg11, %arg8, %46) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> %48 = "tfl.equal"(%44, %cst_11) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %49 = "tfl.custom"(%48, %43, %41) {custom_code = "FlexSelect", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor %50 = "tfl.custom"(%arg12, %arg8, %49) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> - %51 = "tfl.gather"(%cst_5, %44) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor + %51 = "tfl.gather"(%cst_5, %44) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor %52 = tfl.add %arg9, %51 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor %53 = "tfl.custom"(%arg13, %arg8, %52) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> - %54 = "tfl.gather"(%cst_4, %44) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor + %54 = "tfl.gather"(%cst_4, %44) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor %55 = tfl.add %arg10, %54 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor %56 = "tfl.custom"(%arg14, %arg8, %55) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> %57 = tfl.add %arg7, %cst_11 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor @@ -406,12 +406,12 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { // CHECK: %0 = "tfl.shape"(%arg0) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> -// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = 
"DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: return %1 : tensor // CHECK: } // CHECK: func.func private @func_1_CPU_FLOAT(%arg0: tensor<1xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor<2xi32>) -> (tensor, tensor) attributes {tac.device = "CPU", tac.inference_type = "FLOAT", tac.interface_name = "func_1"} { -// CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "FlexTensorListReserve", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor>> -// CHECK: %1 = "tfl.custom"(%arg0, %arg1) {custom_code = "FlexTensorListReserve", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor>> +// CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "FlexTensorListReserve", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor>> +// CHECK: %1 = "tfl.custom"(%arg0, %arg1) <{custom_code = "FlexTensorListReserve", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor>> // CHECK: %2:8 = "tfl.while"(%arg2, %arg2, %arg3, %arg4, %0, %0, %1, %1) ({ // CHECK: ^bb0(%arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor>>, %arg14: tensor>>, %arg15: tensor>>, %arg16: tensor>>): // CHECK: %7 = func.call @func_2_DARWINN_FLOAT(%arg10, %arg1, %arg9) {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_2"} : (tensor, tensor, tensor) -> tensor @@ -430,24 +430,24 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor : tensor<1xi32> // CHECK: %cst_9 = arith.constant dense<0> : tensor<1xi32> // CHECK: %7:2 = func.call @func_3_DARWINN_FLOAT(%arg5, %cst_9, %cst_8, %cst_7, %cst_6, %cst_5) {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_3"} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor, tensor, tensor<1xi32>) -> (tensor, tensor<2xi32>) -// CHECK: %8 = "tfl.reduce_prod"(%7#1, %cst_9) {keep_dims = true, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>) -> tensor<1xi32> +// CHECK: %8 = "tfl.reduce_prod"(%7#1, %cst_9) <{keep_dims = true}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: %9:3 = func.call @func_4_DARWINN_FLOAT(%arg5, %8, %arg6, %cst_9, %cst_8, %cst_7, %cst_6, %cst_5) {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_4"} : (tensor, tensor<1xi32>, tensor, tensor<1xi32>, tensor<1xi32>, tensor, tensor, tensor<1xi32>) -> (tensor, tensor, tensor<2xi32>) -// CHECK: %10 = "tfl.reduce_prod"(%9#2, %cst_9) {keep_dims = true, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>) -> tensor<1xi32> +// CHECK: %10 = "tfl.reduce_prod"(%9#2, %cst_9) <{keep_dims = true}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: %11 = "tfl.expand_dims"(%arg11, %cst_4) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor 
// CHECK: %12 = "tfl.expand_dims"(%arg12, %cst_4) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %13:10 = func.call @func_5_DARWINN_FLOAT(%arg6, %10, %arg10, %cst_6, %11, %9#1, %cst_3, %cst_2, %12, %7#0, %9#0, %arg7, %cst_1, %cst_0, %arg11, %cst, %arg12, %arg9) {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_5"} : (tensor, tensor<1xi32>, tensor, tensor, tensor, tensor, tensor<1xi32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor<5xi32>, tensor, tensor<5xi32>, tensor, tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -// CHECK: %14 = "tfl.custom"(%13#5, %13#2, %13#1) {custom_code = "FlexSelect", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor -// CHECK: %15 = "tfl.custom"(%arg13, %arg10, %14) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> -// CHECK: %16 = "tfl.custom"(%13#6, %13#4, %13#3) {custom_code = "FlexSelect", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor -// CHECK: %17 = "tfl.custom"(%arg14, %arg10, %16) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> -// CHECK: %18 = "tfl.custom"(%arg15, %arg10, %13#7) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> -// CHECK: %19 = "tfl.custom"(%arg16, %arg10, %13#8) {custom_code = "FlexTensorListSetItem", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> +// CHECK: %14 = "tfl.custom"(%13#5, %13#2, %13#1) <{custom_code = "FlexSelect", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor +// CHECK: %15 = "tfl.custom"(%arg13, %arg10, %14) <{custom_code = "FlexTensorListSetItem", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> +// CHECK: %16 = "tfl.custom"(%13#6, %13#4, %13#3) <{custom_code = "FlexSelect", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor +// CHECK: %17 = "tfl.custom"(%arg14, %arg10, %16) <{custom_code = "FlexTensorListSetItem", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> +// CHECK: %18 = "tfl.custom"(%arg15, %arg10, %13#7) <{custom_code = "FlexTensorListSetItem", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> +// CHECK: %19 = "tfl.custom"(%arg16, %arg10, %13#8) <{custom_code = "FlexTensorListSetItem", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor, tensor) -> tensor>> // CHECK: "tfl.yield"(%13#9, %13#0, %13#7, %13#8, %15, %17, %18, %19) : (tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) -> () // CHECK: }) {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) -> (tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) -// CHECK: %3 = "tfl.custom"(%2#4, %arg0) {custom_code = "FlexTensorListStack", custom_option = #tfl, tac.device = "CPU", 
tac.inference_type = "FLOAT"} : (tensor>>, tensor<1xi32>) -> tensor -// CHECK: %4 = "tfl.custom"(%3, %arg8) {custom_code = "FlexTranspose", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor -// CHECK: %5 = "tfl.custom"(%2#5, %arg0) {custom_code = "FlexTensorListStack", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor<1xi32>) -> tensor -// CHECK: %6 = "tfl.custom"(%5, %arg8) {custom_code = "FlexTranspose", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor +// CHECK: %3 = "tfl.custom"(%2#4, %arg0) <{custom_code = "FlexTensorListStack", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor<1xi32>) -> tensor +// CHECK: %4 = "tfl.custom"(%3, %arg8) <{custom_code = "FlexTranspose", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor +// CHECK: %5 = "tfl.custom"(%2#5, %arg0) <{custom_code = "FlexTensorListStack", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor>>, tensor<1xi32>) -> tensor +// CHECK: %6 = "tfl.custom"(%5, %arg8) <{custom_code = "FlexTranspose", custom_option = #tfl}> {tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor // CHECK: return %4, %6 : tensor, tensor // CHECK: } // CHECK: func.func private @func_2_DARWINN_FLOAT(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_2"} { @@ -458,25 +458,25 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor, %arg4: tensor, %arg5: tensor<1xi32>) -> (tensor, tensor<2xi32>) attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_3"} { // CHECK: %0 = "tfl.shape"(%arg0) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> -// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: %2 = "tfl.range"(%arg3, %1, %arg4) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor -// CHECK: %3 = "tfl.pack"(%1, %arg4) {axis = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor, tensor) -> tensor<2xi32> -// CHECK: %4 = "tfl.strided_slice"(%0, %arg2, %arg5, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor -// CHECK: %5 = tfl.mul(%2, %4) {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor +// CHECK: %3 = "tfl.pack"(%1, %arg4) <{axis = 0 : i32, values_count = 2 
: i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor<2xi32> +// CHECK: %4 = "tfl.strided_slice"(%0, %arg2, %arg5, %arg2) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %5 = tfl.mul(%2, %4) <{fused_activation_function = "NONE"}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %6 = "tfl.reshape"(%5, %3) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor -// CHECK: %7 = "tfl.strided_slice"(%0, %arg1, %arg5, %arg2) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> +// CHECK: %7 = "tfl.strided_slice"(%0, %arg1, %arg5, %arg2) <{begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> // CHECK: return %6, %7 : tensor, tensor<2xi32> // CHECK: } // CHECK: func.func private @func_4_DARWINN_FLOAT(%arg0: tensor, %arg1: tensor<1xi32>, %arg2: tensor, %arg3: tensor<1xi32>, %arg4: tensor<1xi32>, %arg5: tensor, %arg6: tensor, %arg7: tensor<1xi32>) -> (tensor, tensor, tensor<2xi32>) attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_4"} { // CHECK: %0 = "tfl.reshape"(%arg0, %arg1) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor // CHECK: %1 = "tfl.shape"(%arg2) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> -// CHECK: %2 = "tfl.strided_slice"(%1, %arg3, %arg4, %arg4) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %2 = "tfl.strided_slice"(%1, %arg3, %arg4, %arg4) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: %3 = "tfl.range"(%arg5, %2, %arg6) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor -// CHECK: %4 = "tfl.pack"(%2, %arg6) {axis = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor, tensor) -> tensor<2xi32> -// CHECK: %5 = "tfl.strided_slice"(%1, %arg4, %arg7, %arg4) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor -// CHECK: %6 = tfl.mul(%3, %5) {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor +// CHECK: %4 = "tfl.pack"(%2, %arg6) <{axis = 0 : i32, values_count = 2 : i32}> {tac.device = 
"DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor<2xi32> +// CHECK: %5 = "tfl.strided_slice"(%1, %arg4, %arg7, %arg4) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %6 = tfl.mul(%3, %5) <{fused_activation_function = "NONE"}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %7 = "tfl.reshape"(%6, %4) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor -// CHECK: %8 = "tfl.strided_slice"(%1, %arg3, %arg7, %arg4) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> +// CHECK: %8 = "tfl.strided_slice"(%1, %arg3, %arg7, %arg4) <{begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> // CHECK: return %0, %7, %8 : tensor, tensor, tensor<2xi32> // CHECK: } // CHECK: func.func private @func_5_DARWINN_FLOAT(%arg0: tensor, %arg1: tensor<1xi32>, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor<1xi32>, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor<5xi32>, %arg14: tensor, %arg15: tensor<5xi32>, %arg16: tensor, %arg17: tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_5"} { @@ -484,22 +484,22 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor // CHECK: %2 = tfl.add %arg4, %arg5 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor // CHECK: %3 = "tfl.reshape"(%2, %arg6) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor -// CHECK: %4 = "tfl.gather"(%0, %3) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor +// CHECK: %4 = "tfl.gather"(%0, %3) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %5 = "tfl.reshape"(%4, %arg6) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor // CHECK: %6 = "tfl.shape"(%5) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<1xi32> // CHECK: %7 = "tfl.fill"(%6, %arg7) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor // CHECK: %8 = tfl.add %arg8, %arg9 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor // CHECK: %9 = "tfl.reshape"(%8, %arg6) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor -// CHECK: %10 = "tfl.gather"(%arg10, %9) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor +// CHECK: %10 = "tfl.gather"(%arg10, %9) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", 
tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %11 = "tfl.reshape"(%10, %arg6) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor // CHECK: %12 = "tfl.shape"(%11) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<1xi32> // CHECK: %13 = "tfl.fill"(%12, %arg7) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor -// CHECK: %14 = "tfl.gather"(%arg11, %arg2) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor +// CHECK: %14 = "tfl.gather"(%arg11, %arg2) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %15 = "tfl.equal"(%14, %arg12) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %16 = "tfl.equal"(%14, %arg3) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor -// CHECK: %17 = "tfl.gather"(%arg13, %14) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor +// CHECK: %17 = "tfl.gather"(%arg13, %14) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor // CHECK: %18 = tfl.add %arg14, %17 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor -// CHECK: %19 = "tfl.gather"(%arg15, %14) {axis = 0 : i32, batch_dims = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor +// CHECK: %19 = "tfl.gather"(%arg15, %14) <{axis = 0 : i32, batch_dims = 0 : i32}> {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<5xi32>, tensor) -> tensor // CHECK: %20 = tfl.add %arg16, %19 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor // CHECK: %21 = tfl.add %arg17, %arg3 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor // CHECK: return %1, %5, %7, %11, %13, %15, %16, %18, %20, %21 : tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc index 4efdd053eec5c2..701d9cad1c34c1 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/cost_model.cc @@ -59,13 +59,14 @@ int64_t GetTransferredTensorBytes(func::CallOp from_graph, for (auto input : to_graph.getOperands()) { Operation* input_op = input.getDefiningOp(); if (input_op && input_op == from_graph.getOperation()) { - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = + mlir::dyn_cast_or_null(input.getType()); if (input_type == nullptr || !input_type.hasStaticShape()) continue; // Quantized type does not support getSizeInBits. 
if (IsQUI8Type(input_type) || IsQI8Type(input_type)) { total_size_transferred += input_type.getNumElements() * 8; } else { - auto s_type = input_type.cast(); + auto s_type = mlir::cast(input_type); total_size_transferred += s_type.getNumElements() * s_type.getElementTypeBitWidth(); } @@ -81,7 +82,8 @@ int64_t GetTransferredElementCount(func::CallOp from_graph, for (auto input : to_graph.getOperands()) { Operation* input_op = input.getDefiningOp(); if (input_op && input_op == from_graph.getOperation()) { - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = + mlir::dyn_cast_or_null(input.getType()); if (input_type == nullptr || !input_type.hasStaticShape()) continue; total_element_count += input_type.getNumElements(); } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc index ea1c299fd546c1..0c37a8da20575f 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc @@ -156,13 +156,13 @@ struct FoldQuantizedI32ToFloat : public OpRewritePattern { if (!IsQI32Type(input_dequant.getType())) return failure(); auto output_type = - dequant_op.getOutput().getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(dequant_op.getOutput().getType()); if (!output_type || !output_type.getElementType().isF32()) return failure(); - auto input_type = input_dequant.getType().dyn_cast(); + auto input_type = mlir::dyn_cast(input_dequant.getType()); // TODO(renjieliu): support UniformQuantizedPerAxisType. - auto q_type = input_type.getElementType() - .dyn_cast_or_null(); + auto q_type = mlir::dyn_cast_or_null( + input_type.getElementType()); if (!q_type) return failure(); const float scale = q_type.getScale(); @@ -183,9 +183,9 @@ struct FoldQuantizedI32ToFloat : public OpRewritePattern { }; auto dequant_values = - input_values.cast().mapValues( - FloatType::getF32(rewriter.getContext()), - llvm::function_ref(dequantize_func)); + mlir::cast(input_values) + .mapValues(FloatType::getF32(rewriter.getContext()), + llvm::function_ref(dequantize_func)); rewriter.replaceOpWithNewOp(dequant_op, dequant_op.getType(), dequant_values); diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc index baf25aa54c109b..278c54e8805f3d 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc @@ -96,11 +96,11 @@ LogicalResult EnsureBias(Operation* op, int bias_idx, PatternRewriter& rewriter) { auto bias = op->getOperand(bias_idx); - if (!bias.getType().isa()) return failure(); + if (!mlir::isa(bias.getType())) return failure(); // Proceed to create a zero bias. auto output = op->getResult(0); - auto output_type = output.getType().dyn_cast_or_null(); + auto output_type = mlir::dyn_cast_or_null(output.getType()); if (!output_type) return failure(); // bias should be a vector sized of the last output dim. 
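[Editor's note] The C++ hunks above and below all apply one mechanical migration: MLIR's deprecated member-function casts (`x.getType().dyn_cast_or_null<T>()`, `.cast<T>()`, `.isa<T>()`) are rewritten as the namespaced free functions `mlir::dyn_cast_or_null<T>(...)`, `mlir::cast<T>(...)`, and `mlir::isa<T>(...)`. A minimal sketch of the idiom follows; it is not part of the patch, and the helper name and header set are assumptions based on a standard MLIR checkout.

// Illustrative sketch only (hypothetical helper, not from the patch).
#include <cstdint>

#include "mlir/IR/BuiltinTypes.h"  // mlir::RankedTensorType
#include "mlir/IR/Value.h"         // mlir::Value
#include "mlir/Support/LLVM.h"     // re-exports llvm::dyn_cast_or_null into mlir::

// Returns the element count of `value` if it is a statically shaped ranked
// tensor, or -1 otherwise (the same guard GetTransferredTensorBytes uses above).
int64_t NumElementsOrMinusOne(mlir::Value value) {
  // Deprecated member form:
  //   value.getType().dyn_cast_or_null<mlir::RankedTensorType>()
  auto tensor_type =
      mlir::dyn_cast_or_null<mlir::RankedTensorType>(value.getType());
  if (!tensor_type || !tensor_type.hasStaticShape()) return -1;
  return tensor_type.getNumElements();
}

The free-function spelling works uniformly on `Type`, `Attribute`, and `Location` handles, which is why the identical rewrite appears later in the patch for `ElementsAttr`, `StringAttr`, and `NamedLoc` as well.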
@@ -163,7 +163,7 @@ SmallVector SliceOutputs(Operation* split_op, Value input, SmallVector slice_size; auto current_output = split_op->getResult(i); auto current_output_type = - current_output.getType().cast(); + mlir::cast(current_output.getType()); for (int d = 0; d < input_type.getRank(); ++d) { if (d == split_dim) { // Split dimension. @@ -208,7 +208,7 @@ LogicalResult LowerPackIntoConcatReshape::matchAndRewrite( TFL::PackOp pack_op, PatternRewriter& rewriter) const { // Pack op should have same shape type. SmallVector pack_inputs(pack_op.getValues()); - auto input_type = pack_inputs[0].getType().dyn_cast(); + auto input_type = mlir::dyn_cast(pack_inputs[0].getType()); if (!input_type) return failure(); // Figure out output shapes. @@ -266,8 +266,8 @@ LogicalResult SquaredDifference::matchAndRewrite( TFL::SquaredDifferenceOp squared_diff_op, PatternRewriter& rewriter) const { auto x = squared_diff_op.getLhs(); auto y = squared_diff_op.getRhs(); - auto x_type = x.getType().dyn_cast(); - auto y_type = y.getType().dyn_cast(); + auto x_type = mlir::dyn_cast(x.getType()); + auto y_type = mlir::dyn_cast(y.getType()); if (!x_type || !y_type) return failure(); if (x_type.getShape() != y_type.getShape()) return failure(); @@ -290,16 +290,16 @@ LogicalResult UnrollSplit::matchAndRewrite(TFL::SplitOp split_op, PatternRewriter& rewriter) const { auto num_splits = split_op.getNumSplits(); auto input = split_op.getValue(); - auto input_type = input.getType().dyn_cast(); + auto input_type = mlir::dyn_cast(input.getType()); if (input_type == nullptr || !input_type.hasStaticShape()) return failure(); for (auto result : split_op.getResults()) { - auto result_type = result.getType().dyn_cast(); + auto result_type = mlir::dyn_cast(result.getType()); if (result_type == nullptr) return failure(); } auto output = split_op.getResult(0); - auto output_type = output.getType().cast(); + auto output_type = mlir::cast(output.getType()); // TODO(renjieliu): change to use split_dim when we raise the constants // as well. @@ -330,11 +330,11 @@ LogicalResult UnrollSplitV::matchAndRewrite(TFL::SplitVOp splitv_op, return failure(); auto input = splitv_op.getValue(); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type || !input_type.hasRank()) return failure(); for (auto result : splitv_op.getResults()) { - auto result_type = result.getType().dyn_cast(); + auto result_type = mlir::dyn_cast(result.getType()); if (result_type == nullptr) return failure(); } @@ -371,20 +371,21 @@ LogicalResult PadSlice::matchAndRewrite(TFL::SliceOp slice_op, // We have to know the shape of the input, as well as the begin/size. // also, begin and size have to be constants. 
auto input = slice_op.getInput(); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type || !input_type.hasStaticShape()) return failure(); if (input_type.getRank() >= 4) return failure(); auto begin = slice_op.getBegin(); - auto begin_type = begin.getType().dyn_cast_or_null(); + auto begin_type = mlir::dyn_cast_or_null(begin.getType()); if (!begin_type || !begin_type.hasStaticShape()) return failure(); auto size = slice_op.getSize(); - auto size_type = size.getType().dyn_cast_or_null(); + auto size_type = mlir::dyn_cast_or_null(size.getType()); if (!size_type || !size_type.hasStaticShape()) return failure(); - auto output_type = slice_op.getType().dyn_cast_or_null(); + auto output_type = + mlir::dyn_cast_or_null(slice_op.getType()); if (!output_type || !output_type.hasStaticShape()) return failure(); // Pad 0s in front of the begin. @@ -472,17 +473,17 @@ LogicalResult FullyConnectedToConv::matchAndRewrite( TFL::FullyConnectedOp fc_op, PatternRewriter& rewriter) const { // We have to know the shape of the input. auto input = fc_op.getInput(); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type || !input_type.hasStaticShape()) return failure(); // We have to know the shape of the weight. auto weight = fc_op.getFilter(); - auto weight_type = weight.getType().dyn_cast_or_null(); + auto weight_type = mlir::dyn_cast_or_null(weight.getType()); if (!weight_type || !weight_type.hasStaticShape()) return failure(); // We have to know the shape of the output as well. auto output = fc_op.getResult(0); - auto output_type = output.getType().dyn_cast_or_null(); + auto output_type = mlir::dyn_cast_or_null(output.getType()); if (!output_type || !output_type.hasStaticShape()) return failure(); // Insert a reshape after the input. @@ -532,13 +533,14 @@ LogicalResult PadConcat::matchAndRewrite(TFL::ConcatenationOp concat_op, PatternRewriter& rewriter) const { int rank = -1; for (auto input : concat_op.getValues()) { - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type || !input_type.hasStaticShape()) return failure(); rank = input_type.getRank(); } - auto output_type = concat_op.getType().dyn_cast_or_null(); + auto output_type = + mlir::dyn_cast_or_null(concat_op.getType()); if (!output_type || !output_type.hasStaticShape()) return failure(); if (rank >= 4) return failure(); @@ -547,7 +549,7 @@ LogicalResult PadConcat::matchAndRewrite(TFL::ConcatenationOp concat_op, // We will insert a reshape op after every input. SmallVector reshape_ops; for (auto input : concat_op.getValues()) { - auto input_type = input.getType().cast(); + auto input_type = mlir::cast(input.getType()); // Get the new shape. SmallVector new_shape; for (int i = 0; i < 4 - rank; ++i) { @@ -603,7 +605,7 @@ LogicalResult PadConcat::matchAndRewrite(TFL::ConcatenationOp concat_op, LogicalResult ReduceMeanToAvgPool::matchAndRewrite( TFL::MeanOp mean_op, PatternRewriter& rewriter) const { auto input = mean_op.getInput(); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); // Only 4d is supported here. 
if (!input_type || input_type.getRank() != 4) return failure(); @@ -619,7 +621,7 @@ LogicalResult ReduceMeanToAvgPool::matchAndRewrite( } auto output = mean_op.getOutput(); - auto output_type = output.getType().dyn_cast_or_null(); + auto output_type = mlir::dyn_cast_or_null(output.getType()); if (!output_type) return failure(); auto input_quantized_type = @@ -669,7 +671,7 @@ LogicalResult ReduceMeanToAvgPool::matchAndRewrite( LogicalResult InsertRequantForReduceMean::matchAndRewrite( TFL::MeanOp mean_op, PatternRewriter& rewriter) const { auto input = mean_op.getInput(); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type) return failure(); // Only need to do this for quantized input. @@ -678,7 +680,7 @@ LogicalResult InsertRequantForReduceMean::matchAndRewrite( if (!input_quantized_type) return failure(); auto output = mean_op.getOutput(); - auto output_type = output.getType().dyn_cast_or_null(); + auto output_type = mlir::dyn_cast_or_null(output.getType()); if (!output_type) return failure(); auto output_quantized_type = quant::QuantizedType::getQuantizedElementType(output_type); diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/fold_constants_to_subgraph.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/fold_constants_to_subgraph.cc index b6c544a8f69c9b..e4985f2b5700d5 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/fold_constants_to_subgraph.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/fold_constants_to_subgraph.cc @@ -107,11 +107,12 @@ bool IsConstOrQConstInt(Operation* op) { if (auto arith_const_op = dyn_cast_or_null(op)) { // arith ConstOp path. - auto type = arith_const_op.getType().cast().getElementType(); + auto type = + mlir::cast(arith_const_op.getType()).getElementType(); if (!type.isInteger(32) && !type.isInteger(64)) return false; } else if (auto const_op = dyn_cast_or_null(op)) { // ConstOp path. - auto type = const_op.getType().cast().getElementType(); + auto type = mlir::cast(const_op.getType()).getElementType(); if (!type.isInteger(32) && !type.isInteger(64)) return false; } else { // QConstOp path. 
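[Editor's note] The `IsConstOrQConstInt` hunk above is the type-side of the same migration: the constant's element type is pulled out of a `ShapedType` with `mlir::cast` before the i32/i64 width checks. Below is a hedged sketch of that check; the helper is hypothetical, and it stays defensive with `dyn_cast` because, unlike the pass, it is not restricted to already-matched tensor-typed constants.

// Illustrative sketch only (hypothetical helper, not from the patch).
#include "mlir/IR/BuiltinTypes.h"  // mlir::ShapedType
#include "mlir/IR/Types.h"         // mlir::Type
#include "mlir/Support/LLVM.h"     // re-exports llvm::dyn_cast into mlir::

// True if `type` (expected non-null) is a shaped type whose elements are
// i32 or i64, mirroring the width checks in IsConstOrQConstInt.
bool HasInt32Or64Elements(mlir::Type type) {
  auto shaped = mlir::dyn_cast<mlir::ShapedType>(type);
  if (!shaped) return false;
  mlir::Type element = shaped.getElementType();
  return element.isInteger(32) || element.isInteger(64);
}

The next file, raise_target_subgraphs.cc, uses the same `mlir::cast<StringAttr>` spelling to flatten a deeply chained attribute lookup into a single readable expression.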
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc index 4fd9f945764b3a..1ff585f6c71cb6 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/raise_target_subgraphs.cc @@ -113,18 +113,11 @@ void AddAttrs(OpsAdded& ops_added, OpBuilder& builder, int func_count) { added_func_op->setAttr(kInterfaceNameAttr, interface_name); added_call_op->setAttr(kInterfaceNameAttr, interface_name); - StringAttr device = added_func_op->getRegion(0) - .getBlocks() - .front() - .front() - .getAttr(kDevice) - .cast(); - StringAttr inference_type = added_func_op->getRegion(0) - .getBlocks() - .front() - .front() - .getAttr(kInferenceType) - .cast(); + StringAttr device = mlir::cast( + added_func_op->getRegion(0).getBlocks().front().front().getAttr(kDevice)); + StringAttr inference_type = mlir::cast( + added_func_op->getRegion(0).getBlocks().front().front().getAttr( + kInferenceType)); added_call_op->setAttr(kDevice, device); added_call_op->setAttr(kInferenceType, inference_type); added_func_op->setAttr(kDevice, device); diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc index a2f7441cc170b1..05cadcbb26b1e7 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc @@ -110,7 +110,7 @@ void ApplyTacFilter(ModuleOp module, const TacFilter& tac_filter, llvm::Regex op_regex(tac_filter.op_filter().op_name_pattern()); module.walk([&](Operation* op) { - auto named_loc = op->getLoc().dyn_cast(); + auto named_loc = mlir::dyn_cast(op->getLoc()); if (!named_loc) { return; } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 4655fa7c069b54..0b7bd8cc7177e6 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/status/status.h" +#include "absl/strings/cord.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" @@ -83,6 +85,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/low_bit_utils.h" #include "tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h" @@ -108,7 +111,6 @@ limitations under the License. 
#include "tensorflow/lite/experimental/remat/metadata_util.h" #include "tensorflow/lite/graph_info.h" #include "tensorflow/lite/python/metrics/converter_error_data.pb.h" -#include "tensorflow/lite/schema/mutable/schema_generated.h" #include "tensorflow/lite/schema/schema_conversion_utils.h" #include "tensorflow/lite/string_util.h" #include "tensorflow/lite/toco/toco_flags.pb.h" @@ -161,6 +163,9 @@ ABSL_CONST_INIT const absl::string_view kFlexOpNamePrefix = "Flex"; // used by the TOCO export. (It does not explain rationale for this choice.) constexpr size_t kInitialBufferSize = 10240; +// Flatbuffer fields to be padded to 16 bytes aligned. +constexpr size_t kFbAlignment = 16; + // Set `isSigned` to false if the `type` is an 8-bit unsigned integer type. // Since tflite doesn't support unsigned for other types, returns error if // `isSigned` is set to false for other types. @@ -185,11 +190,11 @@ static StatusOr GetTFLiteType(Type type, return tflite::TensorType_BFLOAT16; } else if (type.isF64()) { return tflite::TensorType_FLOAT64; - } else if (type.isa()) { + } else if (mlir::isa(type)) { return tflite::TensorType_STRING; - } else if (type.isa()) { + } else if (mlir::isa(type)) { return tflite::TensorType_UINT8; - } else if (auto complex_type = type.dyn_cast()) { + } else if (auto complex_type = mlir::dyn_cast(type)) { auto ftype = complex_type.getElementType(); if (ftype.isF32()) { return tflite::TensorType_COMPLEX64; @@ -198,7 +203,7 @@ static StatusOr GetTFLiteType(Type type, return tflite::TensorType_COMPLEX128; } return Status(absl::StatusCode::kInvalidArgument, "Unsupported type"); - } else if (auto itype = type.dyn_cast()) { + } else if (auto itype = mlir::dyn_cast(type)) { switch (itype.getWidth()) { case 1: return tflite::TensorType_BOOL; @@ -223,19 +228,20 @@ static StatusOr GetTFLiteType(Type type, : tflite::TensorType_INT64; } } else if (auto q_uniform_type = - type.dyn_cast()) { + mlir::dyn_cast(type)) { return GetTFLiteType(q_uniform_type.getStorageType(), q_uniform_type.isSigned()); } else if (auto q_peraxis_type = - type.dyn_cast()) { + mlir::dyn_cast( + type)) { return GetTFLiteType(q_peraxis_type.getStorageType(), q_peraxis_type.isSigned()); } else if (auto q_calibrated_type = - type.dyn_cast()) { + mlir::dyn_cast(type)) { return GetTFLiteType(q_calibrated_type.getExpressedType()); - } else if (type.isa()) { + } else if (mlir::isa(type)) { return tflite::TensorType_RESOURCE; - } else if (type.isa()) { + } else if (mlir::isa(type)) { return tflite::TensorType_VARIANT; } // TFLite export fills FLOAT32 for unknown data types. Returning an error @@ -253,13 +259,13 @@ static bool IsConst(Operation* op) { static bool IsTFResourceOp(Operation* op) { for (const auto& operand : op->getOperands()) { auto elementType = getElementTypeOrSelf(operand.getType()); - if (elementType.isa()) { + if (mlir::isa(elementType)) { return true; } } for (const auto& result : op->getResults()) { auto elementType = getElementTypeOrSelf(result.getType()); - if (elementType.isa()) { + if (mlir::isa(elementType)) { return true; } } @@ -305,7 +311,8 @@ static std::string GetOpDescriptionForDebug(Operation* inst) { os << (!first ? 
", " : ""); first = false; os << named_attr.getName().getValue() << " = "; - if (auto element_attr = named_attr.getValue().dyn_cast()) { + if (auto element_attr = + mlir::dyn_cast(named_attr.getValue())) { if (element_attr.getNumElements() <= kLargeElementsAttr) { element_attr.print(os); } else { @@ -350,9 +357,9 @@ static std::string GetOpsSummary( template static bool HasValidTFLiteType(Value value, T& error_handler) { // None type is allowed to represent unspecified operands. - if (value.getType().isa()) return true; + if (mlir::isa(value.getType())) return true; - auto type = value.getType().dyn_cast(); + auto type = mlir::dyn_cast(value.getType()); if (!type) { if (auto op = value.getDefiningOp()) { error_handler.emitError() @@ -411,7 +418,7 @@ static bool IsValidTFLiteMlirModule(ModuleOp module) { for (auto arg : bb.getArguments()) { if (!HasValidTFLiteType(arg, fn)) { auto elementType = getElementTypeOrSelf(arg.getType()); - if (elementType.isa()) { + if (mlir::isa(elementType)) { return fn.emitError( "function argument uses variant type. Currently, the " "variant type is not natively supported in TFLite. Please " @@ -430,10 +437,10 @@ static bool IsValidTFLiteMlirModule(ModuleOp module) { if (inst.hasTrait()) break; for (auto result : inst.getResults()) { - if (result.getType().isa()) continue; + if (mlir::isa(result.getType())) continue; if (!HasValidTFLiteType(result, inst)) { auto elementType = getElementTypeOrSelf(result.getType()); - if (elementType.isa()) { + if (mlir::isa(elementType)) { return inst.emitError( "operand result uses variant type. Currently, the " "variant type is not natively supported in TFLite. " @@ -716,7 +723,7 @@ class Translator { // Append constant and custom op buffers at the end of the flatbuffer and // calculate the offsets - void AppendBufferData(std::string& result); + void AppendBufferData(absl::Cord& result); // Update constant & custom op buffer offsets // Return false if fail to update offset @@ -767,6 +774,11 @@ class Translator { const std::vector& results, mlir::VhloToStablehloTypeConverter& vhlo_type_converter); + std::optional> BuildVhloCompositeV1Op( + mlir::vhlo::CompositeOpV1 composite_op, + const std::vector& operands, const std::vector& results, + std::string op_name); + std::optional> BuildVhloScatterV1Op( mlir::vhlo::ScatterOpV1 scatter_op, const std::vector& operands, const std::vector& results, @@ -816,7 +828,8 @@ class Translator { // Maps buffer data to corresponding buffer index // in the idx map, the value is a pair of offset and size absl::flat_hash_map> buffer_idx_map_; - absl::flat_hash_map> buffer_data_map_; + absl::flat_hash_map buffer_data_map_; + bool buffer_data_exported_ = false; // Maps custom options data to corresponding node // Key is set to be the list of input tensor indices and list of output tensor @@ -908,7 +921,7 @@ std::optional> Translator::BuildBuffer( if (auto cst = dyn_cast(inst)) { // arith::ConstantOp have ElementAttr at this point due to validation of the // TFLite module. 
- attr = cst.getValue().cast(); + attr = mlir::cast(cst.getValue()); } else if (auto cst = dyn_cast(inst)) { attr = cst.getValue(); } else if (auto cst = dyn_cast(inst)) { @@ -919,10 +932,10 @@ std::optional> Translator::BuildBuffer( attr = cst.getValue(); } else if (auto cst = dyn_cast(inst)) { mlir::VhloToStablehloTypeConverter vhlo_type_converter; - auto tensor_v1_attr = cst.getValue().cast(); + auto tensor_v1_attr = mlir::cast(cst.getValue()); attr = mlir::DenseIntOrFPElementsAttr::getFromRawBuffer( - vhlo_type_converter.convertType(tensor_v1_attr.getType()) - .cast(), + mlir::cast( + vhlo_type_converter.convertType(tensor_v1_attr.getType())), tensor_v1_attr.getData()); } else if (auto cst = dyn_cast(inst)) { attr = cst.getCompressedData(); @@ -945,7 +958,7 @@ std::optional> Translator::BuildBuffer( // trouble calling ConvertToTensor(). For now, extract the tensor data from // ElementsAttr directly in this and read type from tflite::TensorType instead // of tensorflow::DataType. - auto type = value.getType().cast(); + auto type = mlir::cast(value.getType()); tflite::TensorType tflite_element_type = GetTFLiteType(type.getElementType()).value(); if (tflite_element_type == tflite::TensorType_INT4) { @@ -955,7 +968,8 @@ std::optional> Translator::BuildBuffer( } auto packed_buffer = tflite::PackInt4ValuesDensely(data); if (use_buffer_offset_) { - buffer_data_map_[index] = packed_buffer; + buffer_data_map_[index] = + std::string(packed_buffer.begin(), packed_buffer.end()); return tflite::CreateBuffer(builder_, 0, 1, 1); } else { if (IsModelBiggerThan2GB(packed_buffer.size())) { @@ -991,7 +1005,8 @@ std::optional> Translator::BuildBuffer( if (use_buffer_offset_) { std::vector buffer_data(tensor_buffer, tensor_buffer + bytes); free(tensor_buffer); - buffer_data_map_[index] = buffer_data; + buffer_data_map_[index] = + std::string(buffer_data.begin(), buffer_data.end()); return tflite::CreateBuffer(builder_, 0, 1, 1); } else { if (IsModelBiggerThan2GB(bytes)) { @@ -1007,9 +1022,7 @@ std::optional> Translator::BuildBuffer( absl::string_view tensor_data = tensor.tensor_data(); if (use_buffer_offset_) { - std::vector buffer_data(tensor_data.data(), - tensor_data.data() + tensor_data.size()); - buffer_data_map_[index] = buffer_data; + buffer_data_map_[index] = std::string(tensor_data); return tflite::CreateBuffer(builder_, 0, 1, 1); } else { if (IsModelBiggerThan2GB(tensor_data.size())) { @@ -1041,7 +1054,7 @@ int32_t Translator::UnnamedRegionToSubgraph( std::optional>> Translator::BuildTFVariantType(mlir::Type element_type) { std::vector> variant_params; - auto variant_type = element_type.dyn_cast(); + auto variant_type = mlir::dyn_cast(element_type); if (!variant_type) { return variant_params; } @@ -1070,7 +1083,7 @@ Translator::BuildTFVariantType(mlir::Type element_type) { std::optional> Translator::BuildTensorFromType( mlir::Type type, const std::string& name) { - auto tensor_type = type.cast(); + auto tensor_type = mlir::cast(type); llvm::ArrayRef shape_ref; std::vector shape; @@ -1093,15 +1106,15 @@ std::optional> Translator::BuildTensorFromType( return std::nullopt; } BufferOffset q_params = 0; - if (auto qtype = element_type.dyn_cast()) { + if (auto qtype = + mlir::dyn_cast(element_type)) { std::vector scales = {static_cast(qtype.getScale())}; std::vector zero_points = {qtype.getZeroPoint()}; q_params = tflite::CreateQuantizationParameters( builder_, /*min=*/0, /*max=*/0, builder_.CreateVector(scales), builder_.CreateVector(zero_points)); - } else if (auto qtype = - element_type - 
.dyn_cast()) { + } else if (auto qtype = mlir::dyn_cast( + element_type)) { std::vector mins = {static_cast(qtype.getMin())}; std::vector maxs = {static_cast(qtype.getMax())}; q_params = tflite::CreateQuantizationParameters( @@ -1120,7 +1133,7 @@ std::optional> Translator::BuildTensor( Value value, const std::string& name, unsigned buffer_idx, const std::optional>& quant_parameters) { - auto type = value.getType().cast(); + auto type = mlir::cast(value.getType()); // TFLite requires tensor shape only for the inputs and constants. // However, we output all known shapes for better round-tripping @@ -1150,9 +1163,9 @@ std::optional> Translator::BuildTensor( // Const op can have a result of dynamic shaped type (e.g. due to constant // folding), but we can still derive the shape of a constant tensor for // its attribute type. - auto tensor_attr = inst->getAttr("value").cast(); + auto tensor_attr = mlir::cast(inst->getAttr("value")); llvm::ArrayRef shape_ref = - tensor_attr.getType().cast().getShape(); + mlir::cast(tensor_attr.getType()).getShape(); if (mlir::failed(check_shape(shape_ref))) return std::nullopt; shape = std::vector(shape_ref.begin(), shape_ref.end()); @@ -1191,7 +1204,8 @@ std::optional> Translator::BuildTensor( } BufferOffset q_params; - if (auto qtype = element_type.dyn_cast()) { + if (auto qtype = + mlir::dyn_cast(element_type)) { std::vector scales = {static_cast(qtype.getScale())}; std::vector zero_points = {qtype.getZeroPoint()}; q_params = tflite::CreateQuantizationParameters( @@ -1200,8 +1214,8 @@ std::optional> Translator::BuildTensor( builder_, /*min=*/0, /*max=*/0, builder_.CreateVector(scales), builder_.CreateVector(zero_points)); } else if (auto qtype = - element_type - .dyn_cast()) { + mlir::dyn_cast( + element_type)) { std::vector scales(qtype.getScales().begin(), qtype.getScales().end()); std::vector zero_points(qtype.getZeroPoints().begin(), @@ -1339,7 +1353,9 @@ BufferOffset Translator::BuildCustomOperator( Operation* inst, mlir::TFL::CustomOp op, const std::vector& operands, const std::vector& results) { const std::string attrs = - op.getCustomOption().cast().getValue().str(); + mlir::cast(op.getCustomOption()) + .getValue() + .str(); std::vector custom_option_vector(attrs.size(), 0); memcpy(custom_option_vector.data(), attrs.data(), attrs.size()); auto opcode_index = @@ -1492,6 +1508,43 @@ uint32_t Translator::GetOpcodeIndex(const std::string& op_name, return it.first->second; } +void CreateFlexbufferVector( + const std::unique_ptr& flex_builder, + std::string& name, const mlir::Attribute& attr) { + auto start = flex_builder->StartVector(name.c_str()); + auto array = attr.cast().getValue(); + + for (int i = 0; i < array.size(); i++) { + if (llvm::isa(array[i])) { + flex_builder->Bool(name.c_str(), + array[i].cast().getValue()); + } else if (llvm::isa(attr)) { + flex_builder->String(name.c_str(), + array[i].cast().getValue().str()); + } else if (llvm::isa(array[i])) { + flex_builder->Bool(name.c_str(), + array[i].cast().getValue()); + } else if (llvm::isa(array[i])) { + flex_builder->String( + name.c_str(), + array[i].cast().getValue().str()); + } else if (llvm::isa(array[i])) { + flex_builder->Int( + name.c_str(), + array[i].cast().getValue().getSExtValue()); + } else if (llvm::isa(array[i])) { + flex_builder->Float( + name.c_str(), + array[i].cast().getValue().convertToFloat()); + + } else if (llvm::isa(array[i])) { + CreateFlexbufferVector(flex_builder, name, array[i]); + } + } + + flex_builder->EndVector(start, /*typed=*/false, /*fixed=*/false); +} + 
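// ---------------------------------------------------------------------------
// Aside: CreateFlexbufferVector (above) recursively serializes an MLIR
// ArrayAttr into an untyped FlexBuffer vector, and BuildVhloCompositeV1Op
// nests such vectors inside a FlexBuffer map of composite attributes. Below
// is a minimal, self-contained sketch of the same write/read pattern using
// the flexbuffers API from @flatbuffers; the keys "version" and "dims" are
// illustrative only, not taken from this patch.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <iostream>
#include <vector>

#include "flatbuffers/flexbuffers.h"  // from @flatbuffers

int main() {
  flexbuffers::Builder fbb;
  // Writer side: a map of scalars plus an untyped nested vector -- the same
  // shape of data the exporter emits for composite attributes.
  size_t map_start = fbb.StartMap();
  fbb.Int("version", 2);
  size_t vec_start = fbb.StartVector("dims");
  fbb.Int(128);
  fbb.Int(256);
  fbb.EndVector(vec_start, /*typed=*/false, /*fixed=*/false);
  fbb.EndMap(map_start);
  fbb.Finish();
  const std::vector<uint8_t>& buf = fbb.GetBuffer();

  // Reader side: the importer walks the same structure via GetRoot().AsMap().
  auto root = flexbuffers::GetRoot(buf.data(), buf.size()).AsMap();
  std::cout << root["version"].AsInt64() << "\n";  // prints 2
  auto dims = root["dims"].AsVector();
  for (size_t i = 0; i < dims.size(); ++i) {
    std::cout << dims[i].AsInt64() << "\n";  // prints 128 then 256
  }
  return 0;
}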
std::optional> Translator::BuildStablehloOperatorwithoutOptions( Operation* inst, const std::vector& operands, @@ -1511,7 +1564,7 @@ Translator::BuildStablehloPrecisionConfig(::mlir::ArrayAttr precisionConfig) { for (auto it = precisionConfig.begin(); it != precisionConfig.end(); it++) { precision_config_vec.push_back(static_cast( - (it->cast()).getValue())); + (mlir::cast(*it)).getValue())); } return builder_.CreateVector(precision_config_vec); } @@ -1523,7 +1576,7 @@ Translator::BuildVhloPrecisionConfigV1( auto values = precisionConfig.getValue(); for (auto it = values.begin(); it != values.end(); it++) { precision_config_vec.push_back(static_cast( - (it->cast()).getValue())); + (mlir::cast(*it)).getValue())); } return builder_.CreateVector(precision_config_vec); } @@ -1568,6 +1621,78 @@ Translator::BuildStablehloGatherOp(mlir::stablehlo::GatherOp gather_op, tflite::BuiltinOptions2_StablehloGatherOptions, gather_option.Union()); } +std::optional> +Translator::BuildVhloCompositeV1Op(mlir::vhlo::CompositeOpV1 composite_op, + const std::vector& operands, + const std::vector& results, + std::string op_name) { + uint32_t opcode_index = + GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_COMPOSITE); + + int32_t api_version = composite_op.getVersion() + .cast() + .getValue() + .getSExtValue(); + + auto name = builder_.CreateString( + composite_op.getName().cast().getValue().str()); + + auto composite_attributes = composite_op.getCompositeAttributes() + .cast(); + auto flex_builder = std::make_unique(); + size_t map_start = flex_builder->StartMap(); + + for (auto namedAttr : composite_attributes.getValue()) { + auto name = + namedAttr.first.cast().getValue().str(); + auto attr = namedAttr.second; + + if (llvm::isa(attr)) + flex_builder->Bool(name.c_str(), attr.cast().getValue()); + else if (llvm::isa(attr)) + flex_builder->String(name.c_str(), + attr.cast().getValue().str()); + else if (llvm::isa(attr)) + flex_builder->Bool(name.c_str(), + attr.cast().getValue()); + else if (llvm::isa(attr)) + flex_builder->String( + name.c_str(), attr.cast().getValue().str()); + else if (llvm::isa(attr)) + flex_builder->Int( + name.c_str(), + attr.cast().getValue().getSExtValue()); + else if (llvm::isa(attr)) + flex_builder->Float( + name.c_str(), + attr.cast().getValue().convertToFloat()); + } + + flex_builder->EndMap(map_start); + flex_builder->Finish(); + + int32_t decomposition_subgraph_index = + subgraph_index_map_[composite_op.getDecomposition() + .cast() + .getValue() + .str()]; + + auto composite_option = tflite::CreateStableHLOCompositeOptions( + builder_, name, decomposition_subgraph_index, + builder_.CreateVector(flex_builder->GetBuffer()), + tflite::CustomOptionsFormat_FLEXBUFFERS, api_version); + + return tflite::CreateOperator( + builder_, opcode_index, builder_.CreateVector(operands), + builder_.CreateVector(results), tflite::BuiltinOptions_NONE, + /*builtin_options=*/0, /*custom_options=*/0, + tflite::CustomOptionsFormat_FLEXBUFFERS, + /*mutating_variable_inputs=*/0, /*intermediates=*/0, + /*large_custom_options_offset=*/0, /*large_custom_options_size=*/0, + tflite::BuiltinOptions2_StableHLOCompositeOptions, + composite_option.Union()); +} + std::optional> Translator::BuildStablehloScatterOp(mlir::stablehlo::ScatterOp scatter_op, const std::vector& operands, @@ -1732,27 +1857,25 @@ std::optional> Translator::BuildVhloGatherV1Op( GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_GATHER); auto offset_dims = builder_.CreateVector(mlir::GetVector( - gather_op.getOffsetDims().cast(), 
+ mlir::cast(gather_op.getOffsetDims()), vhlo_type_converter)); auto collapsed_slice_dims = builder_.CreateVector(mlir::GetVector( - gather_op.getCollapsedSliceDims().cast(), + mlir::cast(gather_op.getCollapsedSliceDims()), vhlo_type_converter)); auto start_index_map = builder_.CreateVector(mlir::GetVector( - gather_op.getStartIndexMap().cast(), + mlir::cast(gather_op.getStartIndexMap()), vhlo_type_converter)); auto slice_sizes = builder_.CreateVector(mlir::GetVector( - gather_op.getSliceSizes().cast(), + mlir::cast(gather_op.getSliceSizes()), vhlo_type_converter)); auto gather_option = tflite::CreateStablehloGatherOptions( builder_, offset_dims, collapsed_slice_dims, start_index_map, - gather_op.getIndexVectorDim() - .cast() + mlir::cast(gather_op.getIndexVectorDim()) .getValue() .getSExtValue(), slice_sizes, - gather_op.getIndicesAreSorted() - .cast() + mlir::cast(gather_op.getIndicesAreSorted()) .getValue()); return tflite::CreateOperator( @@ -1779,26 +1902,26 @@ std::optional> Translator::BuildVhloScatterV1Op( UnnamedRegionToSubgraph(&body, tflite::BuiltinOperator_STABLEHLO_SCATTER); if (subgraph_index < 0) return std::nullopt; - int64_t index_vector_dim = scatter_op.getIndexVectorDim() - .cast() - .getValue() - .getSExtValue(); - bool unique_indices = scatter_op.getUniqueIndices() - .cast() - .getValue(); - bool indices_are_sorted = scatter_op.getIndicesAreSorted() - .cast() - .getValue(); + int64_t index_vector_dim = + mlir::cast(scatter_op.getIndexVectorDim()) + .getValue() + .getSExtValue(); + bool unique_indices = + mlir::cast(scatter_op.getUniqueIndices()) + .getValue(); + bool indices_are_sorted = + mlir::cast(scatter_op.getIndicesAreSorted()) + .getValue(); auto update_window_dims = builder_.CreateVector(mlir::GetVector( - scatter_op.getUpdateWindowDims().cast(), + mlir::cast(scatter_op.getUpdateWindowDims()), vhlo_type_converter)); auto inserted_window_dims = builder_.CreateVector(mlir::GetVector( - scatter_op.getInsertedWindowDims().cast(), + mlir::cast(scatter_op.getInsertedWindowDims()), vhlo_type_converter)); auto scatter_dims_to_operand_dims = builder_.CreateVector( - mlir::GetVector(scatter_op.getScatterDimsToOperandDims() - .cast(), + mlir::GetVector(mlir::cast( + scatter_op.getScatterDimsToOperandDims()), vhlo_type_converter)); auto options = tflite::CreateStablehloScatterOptions( @@ -1826,20 +1949,22 @@ Translator::BuildVhloReduceWindowV1Op( uint32_t opcode_index = GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_REDUCE_WINDOW); - auto window_dimensions = builder_.CreateVector(mlir::GetVector( - reduce_window_op.getWindowDimensions().cast(), - vhlo_type_converter)); + auto window_dimensions = builder_.CreateVector( + mlir::GetVector(mlir::cast( + reduce_window_op.getWindowDimensions()), + vhlo_type_converter)); auto window_strides = builder_.CreateVector(mlir::GetVector( - reduce_window_op.getWindowStrides().cast(), + mlir::cast(reduce_window_op.getWindowStrides()), vhlo_type_converter)); auto base_dilations = builder_.CreateVector(mlir::GetVector( - reduce_window_op.getBaseDilations().cast(), - vhlo_type_converter)); - auto window_dilations = builder_.CreateVector(mlir::GetVector( - reduce_window_op.getWindowDilations().cast(), + mlir::cast(reduce_window_op.getBaseDilations()), vhlo_type_converter)); + auto window_dilations = builder_.CreateVector( + mlir::GetVector(mlir::cast( + reduce_window_op.getWindowDilations()), + vhlo_type_converter)); auto padding = builder_.CreateVector(mlir::GetVector( - reduce_window_op.getPadding().cast(), + 
mlir::cast(reduce_window_op.getPadding()), vhlo_type_converter)); auto& body = reduce_window_op.getBody(); int32_t subgraph_index = UnnamedRegionToSubgraph( @@ -1870,8 +1995,7 @@ Translator::BuildVhloRngBitGeneratorV1Op( uint32_t opcode_index = GetOpcodeIndex( op_name, tflite::BuiltinOperator_STABLEHLO_RNG_BIT_GENERATOR); tflite::RngAlgorithm algorithm = tflite::RngAlgorithm_DEFAULT; - switch (rng_op.getRngAlgorithm() - .cast() + switch (mlir::cast(rng_op.getRngAlgorithm()) .getValue()) { case mlir::vhlo::RngAlgorithmV1::THREE_FRY: algorithm = tflite::RngAlgorithm_THREEFRY; @@ -1904,13 +2028,13 @@ std::optional> Translator::BuildVhloPadV1Op( GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_PAD); auto edge_padding_low = builder_.CreateVector(mlir::GetVector( - pad_op.getEdgePaddingLow().cast(), + mlir::cast(pad_op.getEdgePaddingLow()), vhlo_type_converter)); auto edge_padding_high = builder_.CreateVector(mlir::GetVector( - pad_op.getEdgePaddingHigh().cast(), + mlir::cast(pad_op.getEdgePaddingHigh()), vhlo_type_converter)); auto interior_padding = builder_.CreateVector(mlir::GetVector( - pad_op.getInteriorPadding().cast(), + mlir::cast(pad_op.getInteriorPadding()), vhlo_type_converter)); auto pad_option = tflite::CreateStablehloPadOptions( @@ -2031,6 +2155,10 @@ std::optional> Translator::BuildOperator( if (auto vhlo_op = llvm::dyn_cast(inst)) { return BuildVhloPadV1Op(vhlo_op, operands, results, vhlo_type_converter); } + if (auto vhlo_op = llvm::dyn_cast(inst)) { + return BuildVhloCompositeV1Op(vhlo_op, operands, results, + inst->getName().getStringRef().str()); + } // for ops don't have kernels, only serialize when conversion is set to // true if (convert_stablehlo_) { @@ -2139,10 +2267,10 @@ std::optional> Translator::BuildOperator( GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_IOTA); auto iota_option = tflite::CreateStablehloIotaOptions( - builder_, vhlo_op.getIotaDimension() - .cast() - .getValue() - .getSExtValue()); + builder_, + mlir::cast(vhlo_op.getIotaDimension()) + .getValue() + .getSExtValue()); return tflite::CreateOperator( builder_, opcode_index, builder_.CreateVector(operands), @@ -2156,7 +2284,7 @@ std::optional> Translator::BuildOperator( op_name, tflite::BuiltinOperator_STABLEHLO_DYNAMIC_SLICE); auto slice_sizes = builder_.CreateVector(mlir::GetVector( - vhlo_op.getSliceSizes().cast(), + mlir::cast(vhlo_op.getSliceSizes()), vhlo_type_converter)); auto dynamic_slice_option = @@ -2179,13 +2307,13 @@ std::optional> Translator::BuildOperator( tflite::StablehloComparisonType_STABLEHLO_COMPARISON_TYPE_NOTYPE; if (compare_type_attr) compare_type = static_cast( - compare_type_attr.cast() + mlir::cast(compare_type_attr) .getValue()); auto compare_option = tflite::CreateStablehloCompareOptions( builder_, static_cast( - vhlo_op.getComparisonDirection() - .cast() + mlir::cast( + vhlo_op.getComparisonDirection()) .getValue()), compare_type); @@ -2202,10 +2330,10 @@ std::optional> Translator::BuildOperator( op_name, tflite::BuiltinOperator_STABLEHLO_CONCATENATE); auto concat_option = tflite::CreateStablehloConcatenateOptions( - builder_, vhlo_op.getDimension() - .cast() - .getValue() - .getSExtValue()); + builder_, + mlir::cast(vhlo_op.getDimension()) + .getValue() + .getSExtValue()); return tflite::CreateOperator( builder_, opcode_index, builder_.CreateVector(operands), @@ -2220,13 +2348,13 @@ std::optional> Translator::BuildOperator( GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_SLICE); auto start_indices = builder_.CreateVector((mlir::GetVector( - 
vhlo_op.getStartIndicesAttr().cast(), + mlir::cast(vhlo_op.getStartIndicesAttr()), vhlo_type_converter))); auto limit_indices = builder_.CreateVector(mlir::GetVector( - vhlo_op.getLimitIndicesAttr().cast(), + mlir::cast(vhlo_op.getLimitIndicesAttr()), vhlo_type_converter)); auto strides = builder_.CreateVector(mlir::GetVector( - vhlo_op.getStridesAttr().cast(), + mlir::cast(vhlo_op.getStridesAttr()), vhlo_type_converter)); auto slice_option = tflite::CreateStablehloSliceOptions( @@ -2245,63 +2373,64 @@ std::optional> Translator::BuildOperator( op_name, tflite::BuiltinOperator_STABLEHLO_CONVOLUTION); auto window_strides = builder_.CreateVector(mlir::GetVector( - vhlo_op.getWindowStrides().cast(), + mlir::cast(vhlo_op.getWindowStrides()), vhlo_type_converter)); auto padding = builder_.CreateVector(mlir::GetVector( - vhlo_op.getPadding().cast(), + mlir::cast(vhlo_op.getPadding()), vhlo_type_converter)); auto lhs_dialation = builder_.CreateVector(mlir::GetVector( - vhlo_op.getLhsDilation().cast(), + mlir::cast(vhlo_op.getLhsDilation()), vhlo_type_converter)); auto rhs_dialation = builder_.CreateVector(mlir::GetVector( - vhlo_op.getRhsDilation().cast(), + mlir::cast(vhlo_op.getRhsDilation()), vhlo_type_converter)); auto window_reversal = builder_.CreateVector(mlir::GetVector( - vhlo_op.getWindowReversal().cast(), + mlir::cast(vhlo_op.getWindowReversal()), vhlo_type_converter)); - auto input_batch_dimension = vhlo_op.getInputBatchDimension() - .cast() + auto input_batch_dimension = mlir::cast( + vhlo_op.getInputBatchDimension()) .getValue() .getSExtValue(); - auto input_feature_dimension = vhlo_op.getInputFeatureDimension() - .cast() + auto input_feature_dimension = mlir::cast( + vhlo_op.getInputFeatureDimension()) .getValue() .getSExtValue(); auto kernel_input_feature_dimension = - vhlo_op.getKernelInputFeatureDimension() - .cast() + mlir::cast( + vhlo_op.getKernelInputFeatureDimension()) .getValue() .getSExtValue(); auto kernel_output_feature_dimension = - vhlo_op.getKernelOutputFeatureDimension() - .cast() + mlir::cast( + vhlo_op.getKernelOutputFeatureDimension()) .getValue() .getSExtValue(); - auto output_batch_dimension = vhlo_op.getOutputBatchDimension() - .cast() + auto output_batch_dimension = mlir::cast( + vhlo_op.getOutputBatchDimension()) .getValue() .getSExtValue(); - auto output_feature_dimension = vhlo_op.getOutputFeatureDimension() - .cast() + auto output_feature_dimension = mlir::cast( + vhlo_op.getOutputFeatureDimension()) .getValue() .getSExtValue(); auto kernel_spatial_dimensions = builder_.CreateVector( - mlir::GetVector(vhlo_op.getKernelSpatialDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getKernelSpatialDimensions()), vhlo_type_converter)); auto output_spatial_dimension = builder_.CreateVector( - mlir::GetVector(vhlo_op.getOutputSpatialDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getOutputSpatialDimensions()), vhlo_type_converter)); auto input_spatial_dimension = builder_.CreateVector( - mlir::GetVector(vhlo_op.getInputSpatialDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getInputSpatialDimensions()), vhlo_type_converter)); BufferOffset> precision_config = 0; if (vhlo_op.getPrecisionConfig()) { precision_config = BuildVhloPrecisionConfigV1( - vhlo_op.getPrecisionConfig().dyn_cast()); + mlir::dyn_cast( + vhlo_op.getPrecisionConfig())); } auto convolution_option = tflite::CreateStablehloConvolutionOptions( @@ -2311,12 +2440,11 @@ std::optional> Translator::BuildOperator( kernel_output_feature_dimension, 
kernel_spatial_dimensions, output_batch_dimension, output_feature_dimension, output_spatial_dimension, - vhlo_op.getFeatureGroupCount() - .cast() + mlir::cast( + vhlo_op.getFeatureGroupCount()) .getValue() .getSExtValue(), - vhlo_op.getBatchGroupCount() - .cast() + mlir::cast(vhlo_op.getBatchGroupCount()) .getValue() .getSExtValue(), precision_config); @@ -2334,8 +2462,8 @@ std::optional> Translator::BuildOperator( op_name, tflite::BuiltinOperator_STABLEHLO_BROADCAST_IN_DIM); auto broadcast_dimensions = builder_.CreateVector( - mlir::GetVector(vhlo_op.getBroadcastDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getBroadcastDimensions()), vhlo_type_converter)); auto broadcast_option = tflite::CreateStablehloBroadcastInDimOptions( @@ -2354,8 +2482,8 @@ std::optional> Translator::BuildOperator( uint32_t opcode_index = GetOpcodeIndex( op_name, tflite::BuiltinOperator_STABLEHLO_CUSTOM_CALL); auto op_api_version = - vhlo_op.getApiVersion() - .cast() + mlir::cast( + vhlo_op.getApiVersion()) .getValue(); int32_t api_version = 0; if (op_api_version == @@ -2371,16 +2499,14 @@ std::optional> Translator::BuildOperator( API_VERSION_STATUS_RETURNING_UNIFIED) api_version = 3; - auto call_target_name = - builder_.CreateString(vhlo_op.getCallTargetName() - .cast() - .getValue() - .str()); - auto backend_config = - builder_.CreateString(vhlo_op.getBackendConfig() - .cast() - .getValue() - .str()); + auto call_target_name = builder_.CreateString( + mlir::cast(vhlo_op.getCallTargetName()) + .getValue() + .str()); + auto backend_config = builder_.CreateString( + mlir::cast(vhlo_op.getBackendConfig()) + .getValue() + .str()); // building the computation info auto flex_builder = std::make_unique(); size_t map_start = flex_builder->StartMap(); @@ -2393,25 +2519,25 @@ std::optional> Translator::BuildOperator( if (name == "call_target_name" || name == "backend_config") continue; if (llvm::isa(attr)) flex_builder->Bool(name.c_str(), - attr.cast().getValue()); + mlir::cast(attr).getValue()); if (llvm::isa(attr)) flex_builder->String( - name.c_str(), attr.cast().getValue().str()); + name.c_str(), + mlir::cast(attr).getValue().str()); if (llvm::isa(attr)) flex_builder->Bool( name.c_str(), - attr.cast().getValue()); + mlir::cast(attr).getValue()); if (llvm::isa(attr)) flex_builder->String( name.c_str(), - attr.cast().getValue().str()); + mlir::cast(attr).getValue().str()); } flex_builder->EndMap(map_start); flex_builder->Finish(); auto custom_call_option = tflite::CreateStablehloCustomCallOptions( builder_, call_target_name, - vhlo_op.getHasSideEffect() - .cast<::mlir::vhlo::BooleanV1Attr>() + mlir::cast<::mlir::vhlo::BooleanV1Attr>(vhlo_op.getHasSideEffect()) .getValue(), backend_config, api_version, 0, builder_.CreateVector(flex_builder->GetBuffer())); @@ -2429,7 +2555,7 @@ std::optional> Translator::BuildOperator( GetOpcodeIndex(op_name, tflite::BuiltinOperator_STABLEHLO_REDUCE); auto dimension = builder_.CreateVector(mlir::GetVector( - vhlo_op.getDimensions().cast(), + mlir::cast(vhlo_op.getDimensions()), vhlo_type_converter)); auto& body = vhlo_op.getBody(); int32_t subgraph_index = UnnamedRegionToSubgraph( @@ -2452,26 +2578,27 @@ std::optional> Translator::BuildOperator( op_name, tflite::BuiltinOperator_STABLEHLO_DOT_GENERAL); auto lhs_batching_dimensions = builder_.CreateVector( - mlir::GetVector(vhlo_op.getLhsBatchingDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getLhsBatchingDimensions()), vhlo_type_converter)); auto rhs_batching_dimensions = builder_.CreateVector( - 
mlir::GetVector(vhlo_op.getRhsBatchingDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getRhsBatchingDimensions()), vhlo_type_converter)); auto lhs_contracting_dimensions = builder_.CreateVector( - mlir::GetVector(vhlo_op.getLhsContractingDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getLhsContractingDimensions()), vhlo_type_converter)); auto rhs_contracting_dimensions = builder_.CreateVector( - mlir::GetVector(vhlo_op.getRhsContractingDimensions() - .cast(), + mlir::GetVector(mlir::cast( + vhlo_op.getRhsContractingDimensions()), vhlo_type_converter)); BufferOffset> precision_config = 0; if (vhlo_op.getPrecisionConfig()) { - precision_config = BuildVhloPrecisionConfigV1( - vhlo_op.getPrecisionConfig().cast()); + precision_config = + BuildVhloPrecisionConfigV1(mlir::cast( + vhlo_op.getPrecisionConfig())); } auto dot_geneoral_option = tflite::CreateStablehloDotGeneralOptions( @@ -2497,11 +2624,11 @@ std::optional> Translator::BuildOperator( auto sort_option = tflite::CreateStablehloSortOptions( builder_, - vhlo_op.getDimension() - .cast() + mlir::cast(vhlo_op.getDimension()) .getValue() .getSExtValue(), - vhlo_op.getIsStable().cast().getValue(), + mlir::cast(vhlo_op.getIsStable()) + .getValue(), comparator_subgraph_index); return tflite::CreateOperator( @@ -2543,7 +2670,7 @@ std::optional> Translator::BuildOperator( auto transpose_option = tflite::CreateStablehloTransposeOptions( builder_, builder_.CreateVector(mlir::GetVector( - vhlo_op.getPermutation().cast(), + mlir::cast(vhlo_op.getPermutation()), vhlo_type_converter))); return tflite::CreateOperator( @@ -2669,7 +2796,8 @@ void Translator::InitializeNamesFromAttribute(FuncOp fn, bool* has_input_attr) { llvm::SmallVector input_names; llvm::SmallVector output_names; - if (auto str = dict_attr.get("inputs").dyn_cast_or_null()) { + if (auto str = + mlir::dyn_cast_or_null(dict_attr.get("inputs"))) { str.getValue().split(input_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); if (input_names.size() != fn.getNumArguments()) { @@ -2683,7 +2811,7 @@ void Translator::InitializeNamesFromAttribute(FuncOp fn, bool* has_input_attr) { } if (auto str = - dict_attr.get("outputs").dyn_cast_or_null()) { + mlir::dyn_cast_or_null(dict_attr.get("outputs"))) { str.getValue().split(output_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); auto term = fn.back().getTerminator(); @@ -2708,13 +2836,14 @@ bool Translator::IsStatefulOperand(mlir::Operation* op, int operand_index) { BufferOffset Translator::GetQuantizationForQuantStatsOpOutput( mlir::quantfork::StatisticsOp stats_op) { - auto layer_stats = stats_op.getLayerStats().cast(); + auto layer_stats = + mlir::cast(stats_op.getLayerStats()); std::optional axis_stats = stats_op.getAxisStats(); std::optional axis = stats_op.getAxis(); std::vector mins, maxs; mlir::DenseFPElementsAttr min_max_attr = axis_stats.has_value() - ? axis_stats.value().cast() + ? mlir::cast(axis_stats.value()) : layer_stats; for (const auto& index_and_value : @@ -2749,7 +2878,7 @@ std::optional> Translator::BuildSubGraph( auto build_tensor_and_buffer = [&](Value value, const int subgraph_index, const std::string& tensor_name) { // NoneType represents optional and may be skipped here. 
- if (value.getType().isa()) { + if (mlir::isa(value.getType())) { return true; } @@ -2833,7 +2962,8 @@ std::optional> Translator::BuildSubGraph( "effective_hidden_scale_intermediate"}; for (const std::string& intermediate : intermediate_names) { auto intermediate_attr = inst.getAttr(intermediate); - if (auto attr = intermediate_attr.dyn_cast_or_null()) { + if (auto attr = + mlir::dyn_cast_or_null(intermediate_attr)) { Type qtype = attr.getValue(); auto tensor_or = BuildTensorFromType( qtype, name_mapper_.GetUniqueName(intermediate).str()); @@ -2879,7 +3009,7 @@ std::optional> Translator::BuildSubGraph( std::vector operands; operands.reserve(real_inst->getNumOperands()); for (auto operand : real_inst->getOperands()) { - if (operand.getType().isa()) + if (mlir::isa(operand.getType())) operands.push_back(kTfLiteOptionalTensor); else if (auto stats_op = llvm::dyn_cast_or_null( @@ -2960,7 +3090,7 @@ Translator::CreateMetadataVector() { for (const auto& named_attr : dict_attr) { StringRef name = named_attr.getName(); mlir::Attribute attr = named_attr.getValue(); - if (auto content = attr.dyn_cast()) { + if (auto content = mlir::dyn_cast(attr)) { metadata.push_back(BuildMetadata(name, content.getValue())); } else { module_.emitError( @@ -3008,7 +3138,7 @@ Translator::CreateMetadataVector() { llvm::SmallVector GetStringsFromAttrWithSeparator( mlir::DictionaryAttr attr, const std::string& attr_key) { llvm::SmallVector result; - if (auto str = attr.get(attr_key).dyn_cast_or_null()) { + if (auto str = mlir::dyn_cast_or_null(attr.get(attr_key))) { str.getValue().split(result, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); } @@ -3027,9 +3157,11 @@ std::vector GetStringsFromDictionaryAttr( auto attrs = arg_attr.getValue(); for (const auto attr : attrs) { if (attr.getName() == attr_name) { - auto array_attr = attr.getValue().dyn_cast_or_null(); + auto array_attr = + mlir::dyn_cast_or_null(attr.getValue()); if (!array_attr || array_attr.empty()) continue; - auto string_attr = array_attr[0].dyn_cast_or_null(); + auto string_attr = + mlir::dyn_cast_or_null(array_attr[0]); if (!string_attr) continue; result.push_back(string_attr.getValue().str()); } @@ -3112,7 +3244,7 @@ std::vector BuildSignaturedef( auto unique_name = std::string(name_mapper.GetUniqueName(operand.get())); result[0].outputs[sig_def_outputs[i]] = unique_name; } - if (auto name_attr = exported_name[0].dyn_cast_or_null()) + if (auto name_attr = mlir::dyn_cast_or_null(exported_name[0])) result[0].signature_key = name_attr.getValue().str(); result[0].subgraph_index = subgraph_index; return result; @@ -3197,17 +3329,19 @@ std::optional Translator::Translate( op_or_arg_name_mapper = &default_op_or_arg_name_mapper; if (!UpdateEntryFunction(module)) return std::nullopt; if (!IsValidTFLiteMlirModule(module)) return std::nullopt; - Translator translator(module, toco_flags, tags, op_or_arg_name_mapper, - metadata, custom_option_alignment); - translator.convert_stablehlo_ = serialize_stablehlo_ops; - auto ret = translator.TranslateInternal(); - if (translator.require_use_buffer_offset_) { + auto translator = std::unique_ptr( + new Translator(module, toco_flags, tags, op_or_arg_name_mapper, metadata, + custom_option_alignment)); + translator->convert_stablehlo_ = serialize_stablehlo_ops; + auto ret = translator->TranslateInternal(); + if (translator->require_use_buffer_offset_) { + ret = std::nullopt; auto new_toco_flags = toco_flags; new_toco_flags.set_use_buffer_offset(true); - Translator new_translator(module, new_toco_flags, tags, - 
op_or_arg_name_mapper, metadata,
-                              custom_option_alignment);
-    return new_translator.TranslateInternal();
+    translator = std::unique_ptr<Translator>(
+        new Translator(module, new_toco_flags, tags, op_or_arg_name_mapper,
+                       metadata, custom_option_alignment));
+    return translator->TranslateInternal();
   }
   return ret;
 }
@@ -3453,63 +3587,91 @@ std::optional<std::string> Translator::TranslateInternal() {
     }
   }
 
-  auto result =
-      std::string(reinterpret_cast<const char*>(builder_.GetBufferPointer()),
-                  builder_.GetSize());
+  absl::Cord result;
+  auto fbs = absl::string_view(
+      reinterpret_cast<const char*>(builder_.GetBufferPointer()),
+      builder_.GetSize());
+  result.Append(fbs);
 
   // Return serialized string for the built FlatBuffer.
   if (use_buffer_offset_) {
+    // Pad to be 16 bytes aligned
+    {
+      std::string pad(kFbAlignment - result.size() % kFbAlignment, '\0');
+      result.Append(std::move(pad));
+    }
     AppendBufferData(result);
-    auto mutable_model = tflite::GetMutableModel(result.data());
+    std::string result_str = std::string(std::move(result));
+    auto mutable_model = tflite::GetMutableModel(result_str.data());
     bool ret = UpdateBufferOffsets(mutable_model);
     if (!ret) {
       return std::nullopt;
    }
-    return result;
+    return result_str;
   }
-  return result;
+  return std::string(result);
 }
 
-void Translator::AppendBufferData(std::string& result) {
+void Translator::AppendBufferData(absl::Cord& result) {
   std::unordered_map<uint64_t, std::pair<int64_t, int64_t>> hashcode_to_pos;
-  // Pad to be 16 bytes aligned
-  while (result.size() % 16 != 0) result += '\0';
-  for (auto& it : buffer_data_map_) {
-    auto buffer = std::string(it.second.begin(), it.second.end());
-    int64_t index = it.first;
+  // Buffer data should be exported only once.
+  assert(!buffer_data_exported_);
+
+  auto it = buffer_data_map_.begin();
+  while (it != buffer_data_map_.end()) {
+    std::string buffer = it->second;
+    int64_t index = it->first;
     int64_t offset = result.size();
-    int64_t size = it.second.size();
+    int64_t size = buffer.size();
     uint64_t hash = tsl::Fingerprint64(buffer);
     if (hashcode_to_pos.find(hash) == hashcode_to_pos.end()) {
       hashcode_to_pos[hash] = std::make_pair(offset, size);
       buffer_idx_map_[index] = std::make_pair(offset, size);
-      result += std::string(it.second.begin(), it.second.end());
-      // Pad to be 16 bytes aligned
-      while (result.size() % 16 != 0) result += '\0';
+      result.Append(std::move(buffer));
+      // Pad to be 16 bytes aligned.
+      {
+        std::string pad(kFbAlignment - result.size() % kFbAlignment, '\0');
+        result.Append(std::move(pad));
+      }
     } else {
       // only update offset/index.
       buffer_idx_map_[index] = hashcode_to_pos[hash];
     }
+    buffer_data_map_.erase(it);
+    it = buffer_data_map_.begin();
+    buffer_data_exported_ = true;
   }
   // pad 16 bytes for the last buffer for XNNPack
-  result += "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+  result.Append("\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0");
   // pad to be 16 bytes aligned
-  while (result.size() % 16 != 0) result += '\0';
+  {
+    std::string pad(kFbAlignment - result.size() % kFbAlignment, '\0');
+    result.Append(std::move(pad));
+  }
 
   for (auto& it : custom_op_data_map_) {
-    while (result.size() % 16 != 0) result += '\0';
+    {
+      std::string pad(kFbAlignment - result.size() % kFbAlignment, '\0');
+      result.Append(std::move(pad));
+    }
     if (custom_option_alignment_.has_value()) {
-      while (result.size() % custom_option_alignment_.value() != 0)
-        result += '\0';
+      {
+        auto alignment = custom_option_alignment_.value();
+        std::string pad(alignment - result.size() % alignment, '\0');
+        result.Append(std::move(pad));
+      }
     }
     auto buffer = std::string(it.second.begin(), it.second.end());
     int64_t offset = result.size();
     int64_t size = it.second.size();
     custom_op_idx_map_[it.first] = std::make_pair(offset, size);
-    result += buffer;
+    result.Append(std::move(buffer));
   }
   // pad to be 16 bytes aligned
-  while (result.size() % 16 != 0) result += '\0';
+  {
+    std::string pad(kFbAlignment - result.size() % kFbAlignment, '\0');
+    result.Append(std::move(pad));
+  }
 }
 
 bool Translator::UpdateBufferOffsets(tflite::Model* mutable_model) {
@@ -3568,8 +3730,8 @@ BufferOffset<tflite::SparsityParameters> Translator::BuildSparsityParameters(
   std::vector<BufferOffset<tflite::DimensionMetadata>> fb_dim_metadata(
       dim_size);
   for (int i = 0; i < dim_size; i++) {
-    const auto dim_metadata =
-        s_attr.getDimMetadata()[i].dyn_cast<mlir::TFL::DimensionMetadataAttr>();
+    const auto dim_metadata = mlir::dyn_cast<mlir::TFL::DimensionMetadataAttr>(
+        s_attr.getDimMetadata()[i]);
     if (dim_metadata.getFormat().getValue() ==
         mlir::TFL::DimensionType::DENSE) {
       fb_dim_metadata[i] = tflite::CreateDimensionMetadata(
diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
index 0d477b51b6d467..bcc0244194dccd 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
@@ -77,6 +77,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/lite/offset_buffer.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h"
 #include "tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h"
 #include "tensorflow/compiler/mlir/lite/utils/convert_type.h"
@@ -96,7 +97,6 @@ limitations under the License.
#include "tensorflow/lite/experimental/remat/metadata_util.h" #include "tensorflow/lite/graph_info.h" #include "tensorflow/lite/model_builder.h" -#include "tensorflow/lite/schema/mutable/schema_generated.h" #include "tensorflow/lite/schema/schema_utils.h" #include "tsl/platform/status.h" #include "tsl/platform/statusor.h" @@ -770,6 +770,20 @@ StatusOr ConvertOp( mlir::BuiltinOptionsToAttributes(op.builtin_options, builder, attrs); mlir::BuiltinOptions2ToAttributes(op.builtin_options_2, builder, attrs); } + + if (builtin_code == tflite::BuiltinOperator_STABLEHLO_COMPOSITE) { + auto composite_options = op.builtin_options_2.AsStableHLOCompositeOptions(); + std::string decomposition = ""; + if (composite_options->decomposition_subgraph_index > -1) { + decomposition = + func_names.at(composite_options->decomposition_subgraph_index); + } + + attrs.emplace_back(builder.getNamedAttr( + "decomposition", + mlir::vhlo::StringV1Attr::get(builder.getContext(), decomposition))); + } + op_state.addAttributes(attrs); // Handle the conversion from subgraph index to functions for If and While. We diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index f72ef1f9641d48..3dfc21f5c2a07c 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include "absl/strings/str_cat.h" @@ -41,9 +42,11 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "stablehlo/dialect/VhloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" @@ -52,7 +55,6 @@ limitations under the License. 
#include "tensorflow/core/platform/status.h" #include "tensorflow/lite/core/c/builtin_op_data.h" #include "tensorflow/lite/kernels/internal/kernel_utils.h" -#include "tensorflow/lite/schema/mutable/schema_generated.h" #include "tensorflow/lite/schema/schema_utils.h" #include "tsl/platform/status.h" @@ -176,7 +178,7 @@ ConvertI64ArrayAttrForOptionWriter(mlir::ArrayAttr attrArray, std::vector intVec; intVec.reserve(attrArray.getValue().size()); for (auto attr : attrArray.getValue()) { - intVec.push_back(attr.cast().getInt()); + intVec.push_back(mlir::cast(attr).getInt()); } return builder->CreateVector(intVec); } @@ -188,7 +190,7 @@ ConvertF32ArrayAttrForOptionWriter(mlir::ArrayAttr attrArray, floatVec.reserve(attrArray.getValue().size()); for (auto attr : attrArray.getValue()) { floatVec.push_back( - attr.cast().getValue().convertToFloat()); + mlir::cast(attr).getValue().convertToFloat()); } return builder->CreateVector(floatVec); } @@ -301,6 +303,21 @@ static mlir::Attribute BuildVhloArrayV1Attr(std::vector value, return mlir::vhlo::ArrayV1Attr::get(builder.getContext(), value); } +static mlir::Attribute BuildVhloDictionaryV1Attr( + std::vector> value, + mlir::Builder builder) { + return mlir::vhlo::DictionaryV1Attr::get(builder.getContext(), value); +} + +static mlir::Attribute BuildVhloFloatV1Attr(float value, + mlir::Builder builder) { + mlir::StablehloVhloTypeConverter type_converter; + auto vhlo_type = + type_converter.convertType(builder.getF32FloatAttr(value).getType()); + return mlir::vhlo::FloatV1Attr::get(builder.getContext(), vhlo_type, + ::llvm::APFloat(value)); +} + static mlir::Attribute BuildRankedTensorAttr(std::vector shape, std::vector value, mlir::Builder builder) { @@ -327,8 +344,8 @@ static mlir::Attribute BuildVhloTensorV1Attr(std::vector shape, std::vector value, mlir::Builder builder) { mlir::StablehloVhloTypeConverter type_converter; - auto builtin_attr = BuildRankedTensorAttr(shape, value, builder) - .dyn_cast(); + auto builtin_attr = mlir::dyn_cast( + BuildRankedTensorAttr(shape, value, builder)); auto vhlo_type = type_converter.convertType(builtin_attr.getType()); return mlir::vhlo::TensorV1Attr::get(builder.getContext(), vhlo_type, builtin_attr.getRawData()); @@ -338,8 +355,8 @@ static mlir::Attribute BuildVhloTensorV1Attr(std::vector shape, std::vector value, mlir::Builder builder) { mlir::StablehloVhloTypeConverter type_converter; - auto builtin_attr = BuildRankedTensorAttr(shape, value, builder) - .dyn_cast(); + auto builtin_attr = mlir::dyn_cast( + BuildRankedTensorAttr(shape, value, builder)); auto vhlo_type = type_converter.convertType(builtin_attr.getType()); return mlir::vhlo::TensorV1Attr::get(builder.getContext(), vhlo_type, builtin_attr.getRawData()); @@ -416,6 +433,33 @@ static mlir::Attribute BuildTFL_MirrorPaddingAttr(tflite::MirrorPadMode value, return mlir::TFL::MirrorPaddingTypeAttr::get(builder.getContext(), padding); } +static std::vector BuildAttributeVectorFromFlatbuffer( + flexbuffers::Vector flatbuffer_vector, mlir::Builder builder) { + std::vector mlir_vector; + + for (int i = 0; i < flatbuffer_vector.size(); ++i) { + auto value = flatbuffer_vector[i]; + + if (value.IsBool()) { + mlir_vector.push_back(BuildVhloBooleanV1Attr(value.AsBool(), builder)); + } else if (value.IsString()) { + mlir_vector.push_back( + BuildVhloStringV1Attr(value.AsString().str(), builder)); + } else if (value.IsInt()) { + mlir_vector.push_back(BuildVhloIntV1Attr(value.AsInt64(), builder)); + } else if (value.IsFloat()) { + 
mlir_vector.push_back(BuildVhloFloatV1Attr(value.AsFloat(), builder)); + } else if (value.IsVector()) { + std::vector nested_mlir_vector = + BuildAttributeVectorFromFlatbuffer(value.AsVector(), builder); + mlir_vector.push_back( + BuildVhloArrayV1Attr(std::move(nested_mlir_vector), builder)); + } + } + + return mlir_vector; +} + static mlir::Attribute BuildTFL_PaddingAttr(tflite::Padding value, mlir::Builder builder) { const char* option_name = tflite::EnumNamePadding(value); @@ -613,8 +657,6 @@ void BuiltinOptions2ToAttributesManual( bool has_side_effect_set = false; const flexbuffers::Map& computation_map = flexbuffers::GetRoot(op->custom_attributes).AsMap(); - std::vector symbol_vec; - symbol_vec.reserve(computation_map.size()); const auto& keys = computation_map.Keys(); for (size_t i = 0; i < keys.size(); ++i) { const auto key = keys[i].AsKey(); @@ -638,6 +680,61 @@ void BuiltinOptions2ToAttributesManual( "has_side_effect", BuildVhloBooleanV1Attr(false, builder))); return; } + if (const auto* op = op_union.AsStableHLOCompositeOptions()) { + attributes.emplace_back( + builder.getNamedAttr("name", BuildVhloStringV1Attr(op->name, builder))); + + attributes.emplace_back(builder.getNamedAttr( + "version", BuildVhloIntV1Attr(op->version, builder))); + + auto composite_attribute_pairs = + std::vector>(); + + auto composite_attributes = + flexbuffers::GetRoot(op->composite_attributes).AsMap(); + + const auto& keys = composite_attributes.Keys(); + for (size_t i = 0; i < keys.size(); ++i) { + const auto key = keys[i].AsKey(); + const auto& value = composite_attributes[key]; + + std::pair composite_attribute_pair; + composite_attribute_pair.first = BuildVhloStringV1Attr(key, builder); + + if (value.IsBool()) { + composite_attribute_pair.second = + BuildVhloBooleanV1Attr(value.AsBool(), builder); + } + if (value.IsString()) { + composite_attribute_pair.second = + BuildVhloStringV1Attr(value.AsString().str(), builder); + } + if (value.IsInt()) { + composite_attribute_pair.second = + BuildVhloIntV1Attr(value.AsInt64(), builder); + } + if (value.IsFloat()) { + composite_attribute_pair.second = + BuildVhloFloatV1Attr(value.AsFloat(), builder); + } + + if (value.IsVector()) { + std::vector mlir_vector = + BuildAttributeVectorFromFlatbuffer(value.AsVector(), builder); + + composite_attribute_pair.second = + BuildVhloArrayV1Attr(std::move(mlir_vector), builder); + } + + composite_attribute_pairs.emplace_back(composite_attribute_pair); + } + + attributes.emplace_back(builder.getNamedAttr( + "composite_attributes", + BuildVhloDictionaryV1Attr(std::move(composite_attribute_pairs), + builder))); + return; + } if (const auto* op = op_union.AsStablehloPadOptions()) { std::vector shape = { static_cast(op->edge_padding_low.size())}; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h index 381f2a4c024549..64865eb77b5c43 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h @@ -30,12 +30,13 @@ limitations under the License. 
#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "stablehlo/dialect/VhloOps.h" // from @stablehlo #include "stablehlo/dialect/VhloTypes.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/schema/mutable/schema_generated.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/statusor.h" -#include "tensorflow/lite/schema/mutable/schema_generated.h" namespace mlir { @@ -65,7 +66,7 @@ class StablehloVhloTypeConverter : public mlir::vhlo::VhloTypeConverter { return attr; if (auto stablehloAttr = - attr.dyn_cast_or_null()) { + mlir::dyn_cast_or_null(attr)) { return mlir::vhlo::TypeExtensionsV1Attr::get(stablehloAttr.getContext(), stablehloAttr.getBounds()); } @@ -88,7 +89,8 @@ class VhloToStablehloTypeConverter : public vhlo::VhloTypeConverter { } Attribute convertEncoding(Attribute attr) const final { - if (auto vhloAttr = attr.dyn_cast_or_null()) { + if (auto vhloAttr = + mlir::dyn_cast_or_null(attr)) { return stablehlo::TypeExtensionsAttr::get(vhloAttr.getContext(), vhloAttr.getBounds()); } @@ -296,8 +298,8 @@ static inline std::vector GetVector( vhlo::TensorV1Attr elements, mlir::vhlo::VhloTypeConverter &vhlo_type_converter) { return GetOptionalVector(mlir::DenseIntElementsAttr::getFromRawBuffer( - vhlo_type_converter.convertType(elements.getType()) - .cast(), + mlir::cast( + vhlo_type_converter.convertType(elements.getType())), elements.getData())); } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc b/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc index b3e7e8e633e0da..df28f501ef7656 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_to_string.cc @@ -25,7 +25,7 @@ limitations under the License. #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "flatbuffers/minireflect.h" // from @flatbuffers -#include "tensorflow/lite/schema/reflection/schema_generated.h" +#include "tensorflow/compiler/mlir/lite/schema/reflection/schema_generated.h" #if FLATBUFFERS_LITTLEENDIAN == 0 #include "tensorflow/lite/core/model_builder.h" #endif diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td index 3ae87672d0a99c..e2e10e2e712131 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td @@ -38,7 +38,6 @@ def TFL_Dialect : Dialect { let useDefaultAttributePrinterParser = 1; let useDefaultTypePrinterParser = 1; - let usePropertiesForAttributes = 0; let extraClassDeclaration = [{ ParseResult parseOneResultSameOperandTypeOp(OpAsmParser &parser, diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 8ac81939d0d4de..1633820bb5bd5e 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -86,25 +86,13 @@ namespace { ParseResult parseOneResultSameOperandTypeOp(OpAsmParser& parser, OperationState& result) { SmallVector ops; - Type type; // If the operand list is in-between parentheses, then we have a generic form. // (see the fallback in `printOneResultOp`). 
- SMLoc loc = parser.getCurrentLocation(); if (!parser.parseOptionalLParen()) { - if (parser.parseOperandList(ops) || parser.parseRParen() || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColon() || parser.parseType(type)) - return failure(); - auto fnType = type.dyn_cast(); - if (!fnType) { - parser.emitError(loc, "expected function type"); - return failure(); - } - if (parser.resolveOperands(ops, fnType.getInputs(), loc, result.operands)) - return failure(); - result.addTypes(fnType.getResults()); - return success(); + if (parser.parseOperandList(ops) || parser.parseRParen()) return failure(); + return parser.parseGenericOperationAfterOpName(result, ops); } + Type type; return failure(parser.parseOperandList(ops) || parser.parseOptionalAttrDict(result.attributes) || parser.parseColonType(type) || @@ -1046,9 +1034,9 @@ mlir::LogicalResult CustomOp::verify() { LogicalResult CustomTfOp::inferReturnTypes( MLIRContext*, std::optional location, ValueRange operands, - DictionaryAttr attr, OpaqueProperties, RegionRange ranges, + DictionaryAttr attr, OpaqueProperties properties, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { - CustomTfOpAdaptor op(operands, attr, {}, ranges); + CustomTfOpAdaptor op(operands, attr, properties, regions); if (op.getRegions().empty()) return success(); auto* real_op = &op.getBody().front().front(); @@ -1391,9 +1379,9 @@ static LogicalResult ComputeConvWindowedOutputSize( LogicalResult Conv2DOp::inferReturnTypes( MLIRContext*, std::optional location, ValueRange operands, - DictionaryAttr attr, OpaqueProperties, RegionRange, + DictionaryAttr attr, OpaqueProperties properties, RegionRange, SmallVectorImpl& inferredReturnTypes) { - Conv2DOpAdaptor op(operands, attr); + Conv2DOpAdaptor op(operands, attr, properties); const Value input = op.getInput(); const Value filter = op.getFilter(); @@ -2072,9 +2060,9 @@ mlir::LogicalResult ReshapeOp::verify() { LogicalResult ReshapeOp::inferReturnTypes( MLIRContext* context, std::optional location, ValueRange operands, - DictionaryAttr attr, OpaqueProperties, RegionRange, + DictionaryAttr attr, OpaqueProperties properties, RegionRange, SmallVectorImpl& inferredReturnTypes) { - ReshapeOpAdaptor op(operands, attr); + ReshapeOpAdaptor op(operands, attr, properties); const Value input = op.getInput(); const Value shape = op.getShape(); @@ -2449,9 +2437,9 @@ void FakeQuantOp::getCanonicalizationPatterns(RewritePatternSet& results, LogicalResult UnpackOp::inferReturnTypes( MLIRContext* context, std::optional loc, ValueRange operands, - DictionaryAttr attributes, OpaqueProperties, RegionRange regions, + DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { - UnpackOpAdaptor op(operands, attributes); + UnpackOpAdaptor op(operands, attributes, properties); // TODO(jpienaar): Refactor verify if (failed(op.verify(loc.has_value() ? 
*loc : UnknownLoc::get(context)))) return failure(); @@ -2810,7 +2798,7 @@ mlir::LogicalResult UnidirectionalSequenceLSTMOp::verify() { LogicalResult UnidirectionalSequenceLSTMOp::inferReturnTypes( MLIRContext*, std::optional, ValueRange operands, - DictionaryAttr attr, OpaqueProperties, RegionRange, + DictionaryAttr attr, OpaqueProperties properties, RegionRange, SmallVectorImpl& inferredReturnTypes) { Value input = operands[0]; auto input_type = input.getType().dyn_cast_or_null(); diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 5872fbc8c953c9..ed3a963bbb7523 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -34,10 +34,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_dialect.h.inc" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_enums.h.inc" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" -#include "tensorflow/lite/schema/schema_generated.h" #define GET_ATTRDEF_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_attrdefs.h.inc" diff --git a/tensorflow/compiler/mlir/lite/json_to_flatbuffer.cc b/tensorflow/compiler/mlir/lite/json_to_flatbuffer.cc index 4a4e7a65cd6cdc..80558576e08a38 100644 --- a/tensorflow/compiler/mlir/lite/json_to_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/json_to_flatbuffer.cc @@ -22,7 +22,6 @@ limitations under the License. #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "flatbuffers/idl.h" // from @flatbuffers #include "flatbuffers/util.h" // from @flatbuffers -#include "tensorflow/lite/schema/schema_generated.h" int main(int argc, char** argv) { // load FlatBuffer schema (.fbs) and JSON from disk diff --git a/tensorflow/compiler/mlir/lite/metrics/BUILD b/tensorflow/compiler/mlir/lite/metrics/BUILD index 6218a2fb30a829..464cd8f33822b7 100644 --- a/tensorflow/compiler/mlir/lite/metrics/BUILD +++ b/tensorflow/compiler/mlir/lite/metrics/BUILD @@ -72,5 +72,6 @@ cc_library( "//tensorflow/lite/python/metrics:converter_error_data_proto_cc", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/lite/metrics/types_util.cc b/tensorflow/compiler/mlir/lite/metrics/types_util.cc index b47347ceb03827..7dd658e54dd12e 100644 --- a/tensorflow/compiler/mlir/lite/metrics/types_util.cc +++ b/tensorflow/compiler/mlir/lite/metrics/types_util.cc @@ -20,6 +20,7 @@ limitations under the License. #include "llvm/ADT/TypeSwitch.h" #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/lite/python/metrics/converter_error_data.pb.h" namespace mlir { @@ -67,8 +68,8 @@ class LocationExtractor : public Location { new_call->set_name(loc.getName().str()); // Add child as the source location. 
auto child_loc = loc.getChildLoc(); - if (child_loc.isa()) { - auto typed_child_loc = child_loc.dyn_cast(); + if (mlir::isa(child_loc)) { + auto typed_child_loc = mlir::dyn_cast(child_loc); ExtractFileLine(typed_child_loc, new_call->mutable_source()); } }) @@ -83,7 +84,7 @@ class LocationExtractor : public Location { // Skip the first location if it stores information for propagating // op_type metadata. if (num_locs > 0) { - if (auto name_loc = locations[0].dyn_cast()) { + if (auto name_loc = mlir::dyn_cast(locations[0])) { if (name_loc.getName().strref().ends_with(":")) { if (num_locs == 2) { return LocationExtractor(locations[1]).Extract(error_data); diff --git a/tensorflow/compiler/mlir/lite/python/BUILD b/tensorflow/compiler/mlir/lite/python/BUILD index 203a06ff721a02..c7f50ba6edf81c 100644 --- a/tensorflow/compiler/mlir/lite/python/BUILD +++ b/tensorflow/compiler/mlir/lite/python/BUILD @@ -95,6 +95,7 @@ cc_library( "//tensorflow/lite/toco:model_flags_proto_cc", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/toco:types_proto_cc", + "@com_google_absl//absl/status", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", @@ -143,10 +144,9 @@ cc_library( ], deps = [ "//tensorflow/compiler/mlir/lite:flatbuffer_import", + "@com_google_absl//absl/strings:string_view", "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", - "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TranslateLib", ], diff --git a/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc b/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc index 23b3714e73a758..6591251d9e915b 100644 --- a/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc +++ b/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc @@ -18,25 +18,19 @@ limitations under the License. #include #include -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/InitLLVM.h" +#include "absl/strings/string_view.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" diff --git a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h index e5ce76612f3e01..11a8e28ebed0cc 100644 --- a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.h @@ -18,6 +18,8 @@ limitations under the License. 
#include #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 16e12bbb6da04d..085478db128a71 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -21,14 +21,18 @@ limitations under the License. #include #include +#include "absl/status/status.h" #include "absl/types/span.h" #include "llvm/ADT/StringSet.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" @@ -42,9 +46,11 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" +#include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" namespace tensorflow { @@ -80,7 +86,7 @@ Status HandleInputOutputArraysWithModule( if (!input_attr) { return errors::InvalidArgument("no inputs attribute found"); } - auto input_names = input_attr.cast().getValue(); + auto input_names = mlir::cast(input_attr).getValue(); input_names.split(function_input_names, ",", /*MaxSplit=*/-1, /*KeepEmpty=*/false); const int function_input_names_size = function_input_names.size(); @@ -106,7 +112,7 @@ Status HandleInputOutputArraysWithModule( if (!output_attr) { return errors::InvalidArgument("no outputs attribute found"); } - auto output_names = output_attr.cast().getValue(); + auto output_names = mlir::cast(output_attr).getValue(); output_names.split(function_output_names, ",", /*MaxSplit=*/-1, /*KeepEmpty=*/false); const int function_output_names_size = function_output_names.size(); diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h index 87b5d80d025b47..a9170bdc86a4f5 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h @@ -19,6 +19,8 @@ limitations under the License. 
#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 224db40942ef90..a4512d226939f7 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -43,10 +43,10 @@ limitations under the License. #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/op_def_builder.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/public/session.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index ef436759985dcb..a57b3585abadb1 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -24,10 +24,14 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc index 3de159a1414429..f6eac3e90ec8bd 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc +++ b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc @@ -19,6 +19,7 @@ limitations under the License. #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/Passes.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" @@ -80,7 +81,7 @@ LogicalResult QuantizedConstRewrite::matchAndRewrite( } // Is the constant value a type expressed in a way that we support? 
- if (!value.isa()) { + if (!mlir::isa(value)) { return failure(); } diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc index e99addc5b5f8a5..a51956ce08a239 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc @@ -13,18 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/Passes.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" #include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h" -using namespace mlir; -using namespace mlir::quantfork; - -namespace { +namespace mlir::quantfork { #define GEN_PASS_DEF_QUANTCONVERTSIMULATEDQUANT #include "tensorflow/compiler/mlir/lite/quantization/ir/Passes.h.inc" @@ -51,7 +59,6 @@ class FakeQuantRewrite : public OpRewritePattern { *hadFailure = true; return failure(); } - return success(); } @@ -66,7 +73,7 @@ class FakeQuantRewrite : public OpRewritePattern { quant::QuantizedType elementType = static_cast(this) - ->convertFakeQuantAttrsToType(op, converter.expressedType); + ->convertFakeQuantAttrsToType(op, converter.expressed_type); if (!elementType) { // Note that the fakeQuantAttrsToType will have emitted the error. @@ -81,7 +88,7 @@ class FakeQuantRewrite : public OpRewritePattern { // this is a forced/hard-coded constraint. 
    auto qbarrier = rewriter.create<QuantizeCastOp>(op.getLoc(), quantizedType,
                                                    op.getInputs());
-    rewriter.replaceOpWithNewOp<DequantizeCastOp>(op, converter.inputType,
+    rewriter.replaceOpWithNewOp<DequantizeCastOp>(op, converter.input_type,
                                                   qbarrier.getResult());

     return false;
@@ -121,9 +128,9 @@ class ConstFakeQuantPerAxisRewrite
     min.reserve(fqOp.getMin().size());
     max.reserve(fqOp.getMax().size());
     for (auto m : fqOp.getMin())
-      min.push_back(m.cast<FloatAttr>().getValueAsDouble());
+      min.push_back(cast<FloatAttr>(m).getValueAsDouble());
     for (auto m : fqOp.getMax())
-      max.push_back(m.cast<FloatAttr>().getValueAsDouble());
+      max.push_back(cast<FloatAttr>(m).getValueAsDouble());

     return fakeQuantAttrsToType(fqOp.getLoc(), fqOp.getNumBits(),
                                 fqOp.getAxis(), min, max, fqOp.getNarrowRange(),
@@ -131,8 +138,6 @@
   }
 };

-}  // namespace
-
 void ConvertSimulatedQuantPass::runOnOperation() {
   bool hadFailure = false;
   auto func = getOperation();
@@ -144,7 +149,8 @@ void ConvertSimulatedQuantPass::runOnOperation() {
   if (hadFailure) signalPassFailure();
 }

-std::unique_ptr<OperationPass<func::FuncOp>>
-mlir::quantfork::createConvertSimulatedQuantPass() {
+std::unique_ptr<OperationPass<func::FuncOp>> createConvertSimulatedQuantPass() {
   return std::make_unique<ConvertSimulatedQuantPass>();
 }
+
+}  // namespace mlir::quantfork
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc b/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc
index d111141958c403..8aa6475b888702 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/Matchers.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project

 using namespace mlir;
 using namespace mlir::quantfork;
@@ -51,20 +52,20 @@ OpFoldResult StorageCastOp::fold(FoldAdaptor) {

 /// The quantization specification should match the expressed type.
 static bool isValidQuantizationSpec(Attribute quantSpec, Type expressed) {
-  if (auto typeAttr = quantSpec.dyn_cast<TypeAttr>()) {
+  if (auto typeAttr = mlir::dyn_cast<TypeAttr>(quantSpec)) {
     Type spec = typeAttr.getValue();
-    if (spec.isa<TensorType, VectorType>()) return false;
+    if (mlir::isa<TensorType, VectorType>(spec)) return false;

     // The spec should be either a quantized type which is compatible to the
     // expressed type, or a primitive type which is as same as the
     // (element type of) the expressed type.
-    if (auto quantizedType = spec.dyn_cast<quant::QuantizedType>())
+    if (auto quantizedType = mlir::dyn_cast<quant::QuantizedType>(spec))
       return quantizedType.isCompatibleExpressedType(expressed);

-    if (auto tensorType = expressed.dyn_cast<TensorType>())
+    if (auto tensorType = mlir::dyn_cast<TensorType>(expressed))
       return spec == tensorType.getElementType();

-    if (auto vectorType = expressed.dyn_cast<VectorType>())
+    if (auto vectorType = mlir::dyn_cast<VectorType>(expressed))
       return spec == vectorType.getElementType();
   }
   return false;
@@ -99,13 +100,13 @@ LogicalResult QuantizeRegionOp::verify() {
 }

 LogicalResult StatisticsOp::verify() {
-  auto tensorArg = getArg().getType().dyn_cast<TensorType>();
+  auto tensorArg = mlir::dyn_cast<TensorType>(getArg().getType());
   if (!tensorArg) return emitOpError("arg needs to be tensor type.");

   // Verify layerStats attribute.
{ auto layerStatsType = getLayerStats().getShapedType(); - if (!layerStatsType.getElementType().isa()) { + if (!mlir::isa(layerStatsType.getElementType())) { return emitOpError("layerStats must have a floating point element type"); } if (layerStatsType.getRank() != 1 || layerStatsType.getDimSize(0) != 2) { @@ -122,7 +123,7 @@ LogicalResult StatisticsOp::verify() { std::multiplies()); auto axisStatsType = getAxisStats()->getShapedType(); - if (!axisStatsType.getElementType().isa()) { + if (!mlir::isa(axisStatsType.getElementType())) { return emitOpError("axisStats must have a floating point element type"); } if (axisStatsType.getRank() != 2 || axisStatsType.getDimSize(1) != 2 || diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/QuantOpsBase.td b/tensorflow/compiler/mlir/lite/quantization/ir/QuantOpsBase.td index f9afdc41db1dac..ed7a16c74d0fb7 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/QuantOpsBase.td +++ b/tensorflow/compiler/mlir/lite/quantization/ir/QuantOpsBase.td @@ -27,7 +27,6 @@ include "mlir/IR/OpBase.td" def QuantizationFork_Dialect : Dialect { let name = "quantfork"; let cppNamespace = "::mlir::quantfork"; - let usePropertiesForAttributes = 0; } #endif // QUANT_FORK_BASE diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc index 919c711272b2c1..2ad06f77de8866 100644 --- a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc @@ -32,8 +32,8 @@ using namespace mlir::quantfork; static Attribute convertPrimitiveValueAttr( Attribute origRealValue, quant::QuantizedType quantizedElementType, const UniformQuantizedValueConverter &converter, Type &outConvertedType) { - if (origRealValue.isa()) { - FloatAttr floatAttr = origRealValue.cast(); + if (mlir::isa(origRealValue)) { + FloatAttr floatAttr = mlir::cast(origRealValue); outConvertedType = quantizedElementType.getStorageType(); return IntegerAttr::get(quantizedElementType.getStorageType(), converter.quantizeFloatToInt(floatAttr.getValue())); @@ -64,11 +64,11 @@ static SparseElementsAttr convertSparseElementsAttr( quant::QuantizedType quantizedElementType, const UniformQuantizedValueConverter &converter) { DenseElementsAttr realDenseAttr = realSparseAttr.getValues(); - if (!realDenseAttr.isa()) { + if (!mlir::isa(realDenseAttr)) { return nullptr; } DenseElementsAttr quantDenseAttr = - convertDenseFPElementsAttr(realDenseAttr.cast(), + convertDenseFPElementsAttr(mlir::cast(realDenseAttr), quantizedElementType, converter); if (!quantDenseAttr) { return nullptr; @@ -76,9 +76,9 @@ static SparseElementsAttr convertSparseElementsAttr( // Cast from an expressed-type-based type to storage-type-based type, // preserving the sparse shape (i.e. tensor<4xf32> -> tensor<4xi8>). - ShapedType newSparseType = - quantizedElementType.castExpressedToStorageType(realSparseAttr.getType()) - .dyn_cast_or_null(); + ShapedType newSparseType = mlir::dyn_cast_or_null( + quantizedElementType.castExpressedToStorageType( + realSparseAttr.getType())); if (!newSparseType) { return nullptr; } @@ -93,17 +93,19 @@ Attribute mlir::quantfork::quantizeAttrUniform( Attribute realValue, quant::UniformQuantizedType quantizedElementType, const UniformQuantizedValueConverter &converter, Type &outConvertedType) { // Fork to handle different variants of constants supported. - if (realValue.isa()) { + if (mlir::isa(realValue)) { // Dense tensor or vector constant. 
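
quantizeAttrUniform above dispatches on the constant kind; underneath, each scalar is mapped by the inverse of the dequantize formula, clamped to the storage range. A self-contained sketch of that per-element transform (illustrative only, not the converter's exact rounding behavior):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Quantize one float: q = clamp(round(f / scale) + zero_point), the
    // inverse of f = scale * (q - zero_point).
    static int8_t QuantizeToInt8(float f, float scale, int32_t zero_point) {
      int32_t q = static_cast<int32_t>(std::lround(f / scale)) + zero_point;
      return static_cast<int8_t>(std::clamp<int32_t>(q, -128, 127));
    }
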
- auto converted = convertDenseFPElementsAttr( - realValue.cast(), quantizedElementType, converter); + auto converted = + convertDenseFPElementsAttr(mlir::cast(realValue), + quantizedElementType, converter); outConvertedType = converted.getType(); return converted; } - if (realValue.isa()) { + if (mlir::isa(realValue)) { // Sparse tensor or vector constant. - auto converted = convertSparseElementsAttr( - realValue.cast(), quantizedElementType, converter); + auto converted = + convertSparseElementsAttr(mlir::cast(realValue), + quantizedElementType, converter); outConvertedType = converted.getType(); return converted; } @@ -121,13 +123,14 @@ Attribute mlir::quantfork::quantizeAttr( Attribute realValue, quant::QuantizedType quantizedElementType, Type &outConvertedType) { if (auto uniformQuantized = - quantizedElementType.dyn_cast()) { + mlir::dyn_cast(quantizedElementType)) { UniformQuantizedValueConverter converter(uniformQuantized); return quantizeAttrUniform(realValue, uniformQuantized, converter, outConvertedType); } if (auto uniformQuantizedPerAxis = - quantizedElementType.dyn_cast()) { + mlir::dyn_cast( + quantizedElementType)) { UniformQuantizedPerAxisValueConverter converter(uniformQuantizedPerAxis); auto converted = converter.convert(realValue); // TODO: why we need this outConvertedType? remove it? diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index 66df4f528aa43d..48e8ebe35dde67 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -31,18 +31,19 @@ cc_library( "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tf_tfl_passes", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/core:protos_all_cc", "//tensorflow/lite:framework", "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/core/api", - "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", ], ) @@ -55,24 +56,24 @@ cc_library( "quantize_weights.h", ], deps = [ - ":quantize_model", "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize", "//tensorflow/compiler/mlir/lite:tf_tfl_passes", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/core:protos_all_cc", "//tensorflow/lite:framework", + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/core/api", - "//tensorflow/lite/schema:schema_fbs", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", + "@flatbuffers//:runtime_cc", "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", ], ) @@ -112,11 +113,10 @@ tf_cc_binary( ], deps = [ ":quantize_model", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/lite:framework", - 
"//tensorflow/lite/schema:schema_fbs", - "@com_google_absl//absl/strings", + "//tensorflow/lite/c:c_api_types", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", ], ) @@ -164,12 +164,16 @@ tf_cc_test( ], deps = [ ":quantize_model", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/lite:framework", - "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite:string", + "//tensorflow/lite/c:c_api_types", + "//tensorflow/lite/core/api:error_reporter", "//tensorflow/lite/schema:schema_utils", "//tensorflow/lite/tools/optimize:test_util", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -198,15 +202,15 @@ tf_cc_test( ], deps = [ ":quantize_weights", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/lite:framework", - "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/c:c_api_types", "//tensorflow/lite/schema:schema_utils", "//tensorflow/lite/tools/optimize:test_util", "@com_google_googletest//:gtest", "@flatbuffers", - "@llvm-project//llvm:Support", "@local_tsl//tsl/platform:logging", ], ) diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index 19af4a756f9bea..12be81041d66de 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -20,16 +20,20 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" @@ -37,7 +41,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/lite/c/c_api_types.h" -#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/core/api/error_reporter.h" namespace mlir { namespace lite { diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h index 50b397ba0206d2..665766d700512d 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h @@ -20,9 +20,9 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace lite { diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc index 02f07e98a1dbca..f1bf4363e797c3 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc @@ -27,14 +27,19 @@ limitations under the License. #include #include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "flatbuffers/flexbuffers.h" // from @flatbuffers -#include "tensorflow/core/lib/io/path.h" +#include "absl/container/flat_hash_set.h" +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "flatbuffers/vector.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/command_line_flags.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/model_builder.h" #include "tensorflow/lite/schema/schema_utils.h" +#include "tensorflow/lite/string_type.h" #include "tensorflow/lite/tools/optimize/test_util.h" // Note: branched from tensorflow/lite/tools/optimize/quantize_model_test.cc diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc index d55d41cc7a8e6e..e2581e7c53f7f4 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.cc @@ -21,26 +21,30 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "absl/strings/string_view.h" -#include "llvm/ADT/SmallVector.h" +#include "flatbuffers/buffer.h" // from @flatbuffers +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_import.h" -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" -#include "tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/stderr_reporter.h" namespace mlir { namespace lite { diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h index 6c94e4c2d10c71..f92b58ffb3b01c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h @@ -22,9 +22,11 @@ limitations under the License. #include #include "absl/container/flat_hash_set.h" +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" +#include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/model.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace lite { diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc index 57d0fda20ba33a..7056b7a244fc59 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc @@ -21,12 +21,18 @@ limitations under the License. 
#include #include -#include "llvm/ADT/Twine.h" -#include "tensorflow/core/lib/io/path.h" +#include "flatbuffers/buffer.h" // from @flatbuffers +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "flatbuffers/vector.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/command_line_flags.h" -#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/model_builder.h" #include "tensorflow/lite/schema/schema_utils.h" +#include "tensorflow/lite/stderr_reporter.h" #include "tensorflow/lite/tools/optimize/test_util.h" #include "tsl/platform/logging.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc index 4bf154e892bcdb..73e6140d658c4c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc @@ -15,14 +15,16 @@ limitations under the License. #include +#include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" #include "tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/stderr_reporter.h" using llvm::cl::opt; diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc index df86c6fa6a5be9..339dfee21495ae 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc @@ -15,10 +15,13 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h" #include "llvm/Support/Casting.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD index 43142a7a7c52dd..307741acda3439 100644 --- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD @@ -45,7 +45,6 @@ tf_cc_test( "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io", - "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl", # buildcleaner: keep; prevents undefined reference "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_googletest//:gtest_main", diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc index 9a25d849ea7c8a..e6284d273e50d0 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc @@ -268,7 +268,7 @@ Value SetNoFallbackAttr(PatternRewriter &rewriter, Value val) { // Returns true if the attr is a float attribute and be equal to value. static bool FloatValueEquals(const Attribute &attr, double value) { - auto fp_attr = attr.dyn_cast_or_null(); + auto fp_attr = mlir::dyn_cast_or_null(attr); if (fp_attr == nullptr) return false; if (fp_attr.isSplat()) { @@ -281,7 +281,7 @@ static bool FloatValueEquals(const Attribute &attr, double value) { // Returns true if the rank of the value equals to the given rank. 
bool RankEquals(Value value, int rank) { - auto rank_type = value.getType().template dyn_cast(); + auto rank_type = mlir::dyn_cast(value.getType()); return (rank_type && rank_type.getRank() == rank); } diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir index 9c6d9b8aa8059b..b8a9f325b11077 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_default.mlir @@ -12,7 +12,7 @@ func.func @bias_add(%arg0: tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) -> ten func.func @add(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> func.return %0: tensor<1xf32> -// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0, %arg1) {custom_code = "FlexAdd", custom_option = #tfl} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> +// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0, %arg1) <{custom_code = "FlexAdd", custom_option = #tfl}> : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> // CHECK: return %[[CUSTOM_0]] : tensor<1xf32> } @@ -20,7 +20,7 @@ func.func @add(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { func.func @softmax(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Softmax"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> func.return %0 : tensor<8x16xf32> -// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0) {custom_code = "FlexSoftmax", custom_option = #tfl} : (tensor<8x16xf32>) -> tensor<8x16xf32> +// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0) <{custom_code = "FlexSoftmax", custom_option = #tfl}> : (tensor<8x16xf32>) -> tensor<8x16xf32> // CHECK: return %[[CUSTOM_0]] : tensor<8x16xf32> } @@ -52,7 +52,7 @@ func.func @conv2d_backprop_input_with_sub(%arg0: tensor<4xi32>, %arg1: tensor<3x func.func @depth_to_space(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { %0 = "tf.DepthToSpace"(%arg0) {block_size = 2: i64, data_format = "NHWC"}: (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> func.return %0 : tensor<1x2x2x1xf32> -// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0) {custom_code = "FlexDepthToSpace", custom_option = #tfl} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> +// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0) <{custom_code = "FlexDepthToSpace", custom_option = #tfl}> : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> // CHECK: return %[[CUSTOM_0]] : tensor<1x2x2x1xf32> } @@ -60,7 +60,7 @@ func.func @depth_to_space(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { func.func @floor_mod(%arg0: tensor<5xf32>, %arg1: tensor<5xf32>) -> tensor<5xf32> { %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> func.return %0 : tensor<5xf32> -// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0, %arg1) {custom_code = "FlexFloorMod", custom_option = #tfl} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> +// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0, %arg1) <{custom_code = "FlexFloorMod", custom_option = #tfl}> : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> // CHECK: return %[[CUSTOM_0]] : tensor<5xf32> } @@ -82,7 +82,7 @@ func.func @identity(%arg0: tensor<2xf32>) -> tensor<*xf32> { func.return %1 : tensor<*xf32> // CHECK: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<1.000000e-03> : tensor}> {device = ""} : () -> tensor // CHECK: %[[IDENTITY_0:.*]] = "tf.Identity"(%arg0) {device = ""} : 
(tensor<2xf32>) -> tensor<*xf32> -// CHECK: %[[ADDV2_0:.*]] = "tfl.custom"(%0, %cst) {custom_code = "FlexAddV2", custom_option = #tfl} : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: %[[ADDV2_0:.*]] = "tfl.custom"(%0, %cst) <{custom_code = "FlexAddV2", custom_option = #tfl}> : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: return %[[ADDV2_0]] : tensor<*xf32> } @@ -148,7 +148,7 @@ func.func @conv_with_relu1_invalid_pattern(%arg0: tensor<1x3x4x3xf32>) -> (tenso // CHECK-DAG: %[[CONST_1:.*]] = "tf.Const"() <{value = dense<[-1.000000e+00, -3.000000e+00]> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK-DAG: %[[CONST_2:.*]] = "tf.Const"() <{value = dense<[1.000000e+00, 3.000000e+00]> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK: %[[CONV2D_0:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]}> : (tensor<1x3x4x3xf32>, tensor<1x1x3x2xf32>) -> tensor<1x3x4x2xf32> -// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%[[CONV2D_0]], %[[CONST_2]]) {custom_code = "FlexMinimum", custom_option = #tfl} : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> -// CHECK: %[[CUSTOM_1:.*]] = "tfl.custom"(%[[CUSTOM_0]], %[[CONST_1]]) {custom_code = "FlexMaximum", custom_option = #tfl} : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> +// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%[[CONV2D_0]], %[[CONST_2]]) <{custom_code = "FlexMinimum", custom_option = #tfl}> : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> +// CHECK: %[[CUSTOM_1:.*]] = "tfl.custom"(%[[CUSTOM_0]], %[[CONST_1]]) <{custom_code = "FlexMaximum", custom_option = #tfl}> : (tensor<1x3x4x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> // CHECK: return %[[CUSTOM_1]] : tensor<1x3x4x2xf32> } diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir index 5835d7d107cef5..258a006ee37fc6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/fallback_to_flex_ops_legacy.mlir @@ -52,7 +52,7 @@ func.func @conv2d_backprop_input_with_sub(%arg0: tensor<4xi32>, %arg1: tensor<3x func.func @depth_to_space(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { %0 = "tf.DepthToSpace"(%arg0) {block_size = 2: i64, data_format = "NHWC"}: (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> func.return %0 : tensor<1x2x2x1xf32> -// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0) {custom_code = "FlexDepthToSpace", custom_option = #tfl} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> +// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0) <{custom_code = "FlexDepthToSpace", custom_option = #tfl}> : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> // CHECK: return %[[CUSTOM_0]] : tensor<1x2x2x1xf32> } @@ -60,7 +60,7 @@ func.func @depth_to_space(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { func.func @floor_mod(%arg0: tensor<5xf32>, %arg1: tensor<5xf32>) -> tensor<5xf32> { %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> func.return %0 : tensor<5xf32> -// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0, %arg1) {custom_code = "FlexFloorMod", custom_option = #tfl} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> +// CHECK: %[[CUSTOM_0:.*]] = "tfl.custom"(%arg0, %arg1) <{custom_code = "FlexFloorMod", custom_option = #tfl}> : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xf32> // 
CHECK: return %[[CUSTOM_0]] : tensor<5xf32> } diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc index 55790c40509946..b4015181886788 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc @@ -133,7 +133,7 @@ struct InsertQuantOpsAfterTFFakeQuantOp if (PerAxis) { // This is a special case that the quant_dim is the last dimensions // according to the tf.FakeQuantWithMinMaxPerChannel. - quant_dim = res.getType().template cast().getRank() - 1; + quant_dim = mlir::cast(res.getType()).getRank() - 1; } // Use the min/max from the operands and the num_bits and narrow_range // attribute to create the quantization parameter for the new quantize op. diff --git a/tensorflow/compiler/mlir/lite/quantization/tests/import_quant_stats.mlir b/tensorflow/compiler/mlir/lite/quantization/tests/import_quant_stats.mlir index 1260089c0f264a..5bcb6837f14d81 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tests/import_quant_stats.mlir +++ b/tensorflow/compiler/mlir/lite/quantization/tests/import_quant_stats.mlir @@ -18,8 +18,8 @@ func.func @import_stats_name(%arg0: tensor<4xf32>, %cst: tensor) -> (tensor func.return %0#0, %0#1 : tensor<2xf32>, tensor<2xf32> // CHECK-NEXT: %[[split:.*]]:2 = "tfl.split" -// CHECK-NEXT: %[[stats1:.*]] = "quantfork.stats"(%[[split]]#0) {layerStats = dense<[-1.000000e+00, 1.000000e+00]> -// CHECK-NEXT: %[[stats2:.*]] = "quantfork.stats"(%[[split]]#1) {layerStats = dense<[-1.000000e+00, 1.000000e+00]> +// CHECK-NEXT: %[[stats1:.*]] = "quantfork.stats"(%[[split]]#0) <{layerStats = dense<[-1.000000e+00, 1.000000e+00]> +// CHECK-NEXT: %[[stats2:.*]] = "quantfork.stats"(%[[split]]#1) <{layerStats = dense<[-1.000000e+00, 1.000000e+00]> // CHECK-NEXT: return %[[stats1]], %[[stats2]] : tensor<2xf32>, tensor<2xf32> } @@ -30,7 +30,7 @@ func.func @import_stats_name_port(%arg0: tensor<4xf32>, %cst: tensor) -> (t func.return %0#0, %0#1 : tensor<2xf32>, tensor<2xf32> // CHECK-NEXT: %[[split:.*]]:2 = "tfl.split" -// CHECK-NEXT: %[[stats1:.*]] = "quantfork.stats"(%[[split]]#0) {layerStats = dense<[-2.000000e+00, 2.000000e+00]> +// CHECK-NEXT: %[[stats1:.*]] = "quantfork.stats"(%[[split]]#0) <{layerStats = dense<[-2.000000e+00, 2.000000e+00]> // CHECK-NEXT: return %[[stats1]], %[[split]]#1 : tensor<2xf32>, tensor<2xf32> } @@ -41,7 +41,7 @@ func.func @import_stats_name_regex(%arg0: tensor<4xf32>, %cst: tensor) -> ( func.return %0#0, %0#1 : tensor<2xf32>, tensor<2xf32> // CHECK-NEXT: %[[split:.*]]:2 = "tfl.split" -// CHECK-NEXT: %[[stats1:.*]] = "quantfork.stats"(%[[split]]#0) {layerStats = dense<[-3.000000e+00, 3.000000e+00]> -// CHECK-NEXT: %[[stats2:.*]] = "quantfork.stats"(%[[split]]#1) {layerStats = dense<[-3.000000e+00, 3.000000e+00]> +// CHECK-NEXT: %[[stats1:.*]] = "quantfork.stats"(%[[split]]#0) <{layerStats = dense<[-3.000000e+00, 3.000000e+00]> +// CHECK-NEXT: %[[stats2:.*]] = "quantfork.stats"(%[[split]]#1) <{layerStats = dense<[-3.000000e+00, 3.000000e+00]> // CHECK-NEXT: return %[[stats1]], %[[stats2]] : tensor<2xf32>, tensor<2xf32> } diff --git a/tensorflow/compiler/mlir/lite/schema/BUILD b/tensorflow/compiler/mlir/lite/schema/BUILD new file mode 100644 index 00000000000000..34b799a9738741 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/schema/BUILD @@ -0,0 +1,46 @@ +load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") +load("//tensorflow:tensorflow.default.bzl", 
"get_compatible_with_portable") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], +) + +exports_files( + srcs = ["schema.fbs"], +) + +flatbuffer_cc_library( + name = "schema_fbs", + srcs = ["schema.fbs"], + compatible_with = get_compatible_with_portable(), +) + +# Generic schema for flatbuffer converter (but with mutable makes bigger). +flatbuffer_cc_library( + name = "schema_fbs_with_mutable", + srcs = ["schema.fbs"], + compatible_with = get_compatible_with_portable(), + flatc_args = [ + "--gen-mutable", + "--gen-object-api", + ], + out_prefix = "mutable/", +) + +# Generic schema for inference on device (but with reflections makes bigger). +flatbuffer_cc_library( + name = "schema_fbs_with_reflection", + srcs = ["schema.fbs"], + compatible_with = get_compatible_with_portable(), + flatc_args = [ + "--reflect-types", + "--reflect-names", + "--no-union-value-namespacing", + "--gen-object-api", + ], + out_prefix = "reflection/", +) diff --git a/tensorflow/compiler/mlir/lite/schema/README.md b/tensorflow/compiler/mlir/lite/schema/README.md new file mode 100644 index 00000000000000..369027689e09e2 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/schema/README.md @@ -0,0 +1,2 @@ +This directory contains schema related files and targets that are used by both +the TFL converter (tf/compiler/mlir/lite/) and the runtime (tf/lite/). \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/schema/schema.fbs b/tensorflow/compiler/mlir/lite/schema/schema.fbs new file mode 100644 index 00000000000000..7ab78be26737ee --- /dev/null +++ b/tensorflow/compiler/mlir/lite/schema/schema.fbs @@ -0,0 +1,1653 @@ +// Copyright 2017 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Revision History +// Version 0: Initial version. +// Version 1: Add subgraphs to schema. +// Version 2: Rename operators to conform to NN API. +// Version 3: Move buffer data from Model.Subgraph.Tensors to Model.Buffers. +// Version 3a: Add new builtin op code field. Has backward compatibility with +// version 3. +// Version 3b: Rename fields in SignatureDef. Has backward compatibility with +// version 3 and 3a. +// Version 3c: Move constant tensor buffers & custom op buffers outside from +// Flatbuffers. Has backward compatibility with version 3, 3a and +// 3b. + +namespace tflite; + +// This corresponds to the version. +file_identifier "TFL3"; +// File extension of any written files. +file_extension "tflite"; + +// IMPORTANT: All new members of tables, enums and unions must be added at the +// end to ensure backwards compatibility. + +// The type of data stored in a tensor. 
+enum TensorType : byte { + FLOAT32 = 0, + FLOAT16 = 1, + INT32 = 2, + UINT8 = 3, + INT64 = 4, + STRING = 5, + BOOL = 6, + INT16 = 7, + COMPLEX64 = 8, + INT8 = 9, + FLOAT64 = 10, + COMPLEX128 = 11, + UINT64 = 12, + // Experimental: Resource and variant types are experimental, that are subject + // to change. Do not implement custom kernels using resource & variant types + // now. + RESOURCE = 13, + VARIANT = 14, + UINT32 = 15, + UINT16 = 16, + INT4 = 17, + BFLOAT16 = 18, +} + +// Custom quantization parameters for experimenting with new quantization +// techniques. +table CustomQuantization { + custom:[ubyte] (force_align: 16); +} + +// Represents a specific quantization technique's parameters. +union QuantizationDetails { + CustomQuantization, +} + +// Parameters for converting a quantized tensor back to float. +table QuantizationParameters { + // These four parameters are the asymmetric linear quantization parameters. + // Given a quantized value q, the corresponding float value f should be: + // f = scale * (q - zero_point) + // For other quantization types, the QuantizationDetails below is used. + min:[float]; // For importing back into tensorflow. + max:[float]; // For importing back into tensorflow. + scale:[float]; // For dequantizing the tensor's values. + zero_point:[long]; + + // If this is not none, the other quantization parameters (i.e. min, max, + // scale, zero_point fields above) are ignored and the value of the + // QuantizationDetails union should be used. + details:QuantizationDetails; + + // Specifies the dimension of the Tensor's shape that the scales and + // zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1] + // with quantization params: + // scale=[1.0, 2.0, 3.0], zero_point=[1, 2, 3], quantization_dimension=1 + // will be quantized across the second dimension of t. + // t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1 + // t[:, 1, :, :] will have scale[1]=2.0, zero_point[0]=2 + // t[:, 2, :, :] will have scale[2]=3.0, zero_point[0]=3 + quantized_dimension:int; +} + +// Sparse tensors. +// We use a modification of the TACO format. +// Reference: http://tensor-compiler.org/kjolstad-oopsla17-tensor-compiler.pdf +// +// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1), +// potentially with a k-dimensional block (0 <= k <= n) with dims +// (dn, ..., dn+k-1), the format needs to specify: +// 1. In what order to traverse these dimensions. For example, to store a 2-D +// matrix in row major order, the traversal order would be (d0, d1), +// whereas to store it in column major order, the traversal order would be +// (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order +// could be (d0, d1, d2, d3). +// 2. How each block dimension in (dn, ..., dn+k-1) maps to the original +// tensor dimension in (d0, ..., dn-1). +// 3. In the traversal order defined above, the format (dense vs. sparse) and +// index metadata for each dimension. For a dense dimension, this is just +// the size of that dimension. For a sparse dimension, it's the same as +// the compressed index defined in the Compressed Sparse Row (CSR) format. +// (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html) + +// The storage type for a dimension. Currently we support: +// 1. DENSE: each coordinate in this dimension is stored implicitly. +// 2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The +// compression technique is the same what CSR uses. 
+// More types like a sparse dimension with a different compression technique +// could be added to the list in the future. +enum DimensionType : byte { + DENSE = 0, + SPARSE_CSR = 1, +} + +table Int32Vector { + values:[int]; +} + +table Uint16Vector { + values:[ushort] (force_align: 4); +} + +table Uint8Vector { + values:[ubyte] (force_align: 4); +} + +// Variable-typed buffer to store the index metadata for a sparse dimension. +// The widest type is Int32 instead of UInt32 because tensor's shape is a int32 +// vector. We don't want the per-dimensional index to overflow that range. +union SparseIndexVector { + Int32Vector, + Uint16Vector, + Uint8Vector +} + +table DimensionMetadata { + // Whether a dimension is dense or sparse. + format:DimensionType; + // Index metadata used for a dimension. + // - If format is DimensionType.DENSE then we use the dense_size field to + // store the size of that dimension. Each index in that dimension is + // stored implicitly. + // - If format is DimensionType.SPARSE_CSR then we use array_segments and + // array_indices to encode that dimension. array_segments represents how + // to segment the indices array, each segment corresponds to one element + // in the previous dimension. array_indices represents the index of the + // non-zero elements within this dimension (as those in the CSR matrix + // format, where the first array is row pointers and the second array is + // column indices). + dense_size:int; + array_segments:SparseIndexVector; + array_indices:SparseIndexVector; +} + +// Parameters to encode a sparse TfLite tensor. +table SparsityParameters { + // The traversal order of the dimensions defined in the `shape` field of the + // conceptual dense tensor. For a n-dimensional tensors with dims (d0, d1, + // ..., dn-1), + // - if not block sparse, the traversal_order is just a permutation of (d0, + // ..., dn-1). For example, a 2-D matrix stored in row-major order would + // have traversal_order = (d0, d1). + // - if block sparse with a k-dimensional block (0 <= k <= n), the + // traversal_order has n + k elements. The first n elements are still a + // permutation of (d0, ..., dn-1). The lask k elements are a permutation + // of (dn, ..., dn+k-1), defining how to traverse a block internally. For + // example, a 2-D matrix with 2-D blocks, both stored in row-major order + // would have traversal_order = (d0, d1, d2, d3). + traversal_order:[int]; + // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n), + // stores how a block dimension in (dn, ..., dn+k-1) maps to the original + // tensor dimension in (d0, ..., dn). + // It's stored in the order of (dn, ..., dn+k-1). + // If not block-sparse, this field is NULL. + block_map:[int]; + // In the traversal order defined above, the metadata needed for + // each dimension to locate the non-zero values in the original dense tensor. + // The size of the dim_metadata array = the size of the traversal_order array + // = n + k. + dim_metadata:[DimensionMetadata]; +} + +// The nested tensor type for VARIANT type. +table VariantSubType { + // The tensor shape. + shape:[int]; + type:TensorType; + // If false, the rank or the number of tensor dimensions is unknown. + // If false, "shape" must be []. + has_rank: bool = false; +} + +table Tensor { + // The tensor shape. The meaning of each entry is operator-specific but + // builtin ops use: [batch size, height, width, number of channels] (That's + // Tensorflow's NHWC). 
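
As a worked example of the affine mapping documented in QuantizationParameters above, f = scale * (q - zero_point), with per-axis parameters selected by the coordinate along quantized_dimension; the helper below is illustrative:

    #include <cstdint>
    #include <vector>

    struct PerAxisQuant {
      std::vector<float> scale;         // one entry per slice of the quantized axis
      std::vector<int64_t> zero_point;  // same length as scale
    };

    // Dequantizes q taken from slice `axis_index` along quantized_dimension.
    static float Dequantize(int8_t q, const PerAxisQuant& p, size_t axis_index) {
      return p.scale[axis_index] *
             static_cast<float>(static_cast<int64_t>(q) - p.zero_point[axis_index]);
    }

    // With scale = {1.0, 2.0, 3.0} and zero_point = {1, 2, 3} as in the comment
    // above, q = 5 in slice 1 dequantizes to 2.0 * (5 - 2) = 6.0f.
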
+ shape:[int]; + type:TensorType; + // An index that refers to the buffers table at the root of the model. Or, + // if there is no data buffer associated (i.e. intermediate results), then + // this is 0 (which refers to an always existent empty buffer). + // + // The data_buffer itself is an opaque container, with the assumption that the + // target device is little-endian. In addition, all builtin operators assume + // the memory is ordered such that if `shape` is [4, 3, 2], then index + // [i, j, k] maps to data_buffer[i*3*2 + j*2 + k]. + buffer:uint; + name:string; // For debugging and importing back into tensorflow. + quantization:QuantizationParameters; // Optional. + + is_variable:bool = false; + + // Parameters to encode a sparse tensor. See the example in + // tensorflow/lite/testdata/sparse_tensor.json. + sparsity:SparsityParameters; // Optional. + + // Encodes `shape` with unknown dimensions. Unknown dimensions are + // represented with -1. + shape_signature:[int]; // Optional. + + // This field is added to distinguish between scalars and tensors of unknown + // ranks (both of which shape is []). + // For scalars (rank = 0), shape = [] and has_rank = true. + // For tensors with known rank (rank > 0) and shape, shape = [...] and + // has_rank = true. + // For tensors with unknown rank and shape, shape = [] and has_rank = false. + has_rank: bool = false; + + // The nested Tensor types for VARIANT type. This is always empty for + // non-VARIANT types. This is optional because the nested type can be omitted. + // Currently only 1 subtype is supported. The field is defined as an array for + // flexibility of supporting multiple subtypes in the future. + variant_tensors:[VariantSubType]; +} + +// A list of builtin operators. Builtin operators are slightly faster than custom +// ones, but not by much. Moreover, while custom operators accept an opaque +// object containing configuration parameters, builtins have a predetermined +// set of acceptable options. +// LINT.IfChange +enum BuiltinOperator : int32 { + ADD = 0, + AVERAGE_POOL_2D = 1, + CONCATENATION = 2, + CONV_2D = 3, + DEPTHWISE_CONV_2D = 4, + DEPTH_TO_SPACE = 5, + DEQUANTIZE = 6, + EMBEDDING_LOOKUP = 7, + FLOOR = 8, + FULLY_CONNECTED = 9, + HASHTABLE_LOOKUP = 10, + L2_NORMALIZATION = 11, + L2_POOL_2D = 12, + LOCAL_RESPONSE_NORMALIZATION = 13, + LOGISTIC = 14, + LSH_PROJECTION = 15, + LSTM = 16, + MAX_POOL_2D = 17, + MUL = 18, + RELU = 19, + // NOTE(aselle): RELU_N1_TO_1 used to be called RELU1, but it was renamed + // since different model developers use RELU1 in different ways. Never + // create another op called RELU1. + RELU_N1_TO_1 = 20, + RELU6 = 21, + RESHAPE = 22, + RESIZE_BILINEAR = 23, + RNN = 24, + SOFTMAX = 25, + SPACE_TO_DEPTH = 26, + SVDF = 27, + TANH = 28, + CONCAT_EMBEDDINGS = 29, + SKIP_GRAM = 30, + CALL = 31, + CUSTOM = 32, + EMBEDDING_LOOKUP_SPARSE = 33, + PAD = 34, + UNIDIRECTIONAL_SEQUENCE_RNN = 35, + GATHER = 36, + BATCH_TO_SPACE_ND = 37, + SPACE_TO_BATCH_ND = 38, + TRANSPOSE = 39, + MEAN = 40, + SUB = 41, + DIV = 42, + SQUEEZE = 43, + UNIDIRECTIONAL_SEQUENCE_LSTM = 44, + STRIDED_SLICE = 45, + BIDIRECTIONAL_SEQUENCE_RNN = 46, + EXP = 47, + TOPK_V2 = 48, + SPLIT = 49, + LOG_SOFTMAX = 50, + // DELEGATE is a special op type for the operations which are delegated to + // other backends. 
+ // WARNING: Experimental interface, subject to change + DELEGATE = 51, + BIDIRECTIONAL_SEQUENCE_LSTM = 52, + CAST = 53, + PRELU = 54, + MAXIMUM = 55, + ARG_MAX = 56, + MINIMUM = 57, + LESS = 58, + NEG = 59, + PADV2 = 60, + GREATER = 61, + GREATER_EQUAL = 62, + LESS_EQUAL = 63, + SELECT = 64, + SLICE = 65, + SIN = 66, + TRANSPOSE_CONV = 67, + SPARSE_TO_DENSE = 68, + TILE = 69, + EXPAND_DIMS = 70, + EQUAL = 71, + NOT_EQUAL = 72, + LOG = 73, + SUM = 74, + SQRT = 75, + RSQRT = 76, + SHAPE = 77, + POW = 78, + ARG_MIN = 79, + FAKE_QUANT = 80, + REDUCE_PROD = 81, + REDUCE_MAX = 82, + PACK = 83, + LOGICAL_OR = 84, + ONE_HOT = 85, + LOGICAL_AND = 86, + LOGICAL_NOT = 87, + UNPACK = 88, + REDUCE_MIN = 89, + FLOOR_DIV = 90, + REDUCE_ANY = 91, + SQUARE = 92, + ZEROS_LIKE = 93, + FILL = 94, + FLOOR_MOD = 95, + RANGE = 96, + RESIZE_NEAREST_NEIGHBOR = 97, + LEAKY_RELU = 98, + SQUARED_DIFFERENCE = 99, + MIRROR_PAD = 100, + ABS = 101, + SPLIT_V = 102, + UNIQUE = 103, + CEIL = 104, + REVERSE_V2 = 105, + ADD_N = 106, + GATHER_ND = 107, + COS = 108, + WHERE = 109, + RANK = 110, + ELU = 111, + REVERSE_SEQUENCE = 112, + MATRIX_DIAG = 113, + QUANTIZE = 114, + MATRIX_SET_DIAG = 115, + ROUND = 116, + HARD_SWISH = 117, + IF = 118, + WHILE = 119, + NON_MAX_SUPPRESSION_V4 = 120, + NON_MAX_SUPPRESSION_V5 = 121, + SCATTER_ND = 122, + SELECT_V2 = 123, + DENSIFY = 124, + SEGMENT_SUM = 125, + BATCH_MATMUL = 126, + PLACEHOLDER_FOR_GREATER_OP_CODES = 127, + CUMSUM = 128, + CALL_ONCE = 129, + BROADCAST_TO = 130, + RFFT2D = 131, + CONV_3D = 132, + IMAG=133, + REAL=134, + COMPLEX_ABS=135, + HASHTABLE = 136, + HASHTABLE_FIND = 137, + HASHTABLE_IMPORT = 138, + HASHTABLE_SIZE = 139, + REDUCE_ALL = 140, + CONV_3D_TRANSPOSE = 141, + VAR_HANDLE = 142, + READ_VARIABLE = 143, + ASSIGN_VARIABLE = 144, + BROADCAST_ARGS = 145, + RANDOM_STANDARD_NORMAL = 146, + BUCKETIZE = 147, + RANDOM_UNIFORM = 148, + MULTINOMIAL = 149, + GELU = 150, + DYNAMIC_UPDATE_SLICE = 151, + RELU_0_TO_1 = 152, + UNSORTED_SEGMENT_PROD = 153, + UNSORTED_SEGMENT_MAX = 154, + UNSORTED_SEGMENT_SUM = 155, + ATAN2 = 156, + UNSORTED_SEGMENT_MIN = 157, + SIGN = 158, + BITCAST = 159, + BITWISE_XOR = 160, + RIGHT_SHIFT = 161, + // All Operators start with STABLEHLO_ prefixes are subject to change + // Many of the ops below can not be executed by TFlite runtime + STABLEHLO_LOGISTIC = 162, // WARNING: Do not have runtime support + STABLEHLO_ADD = 163, + STABLEHLO_DIVIDE = 164, // WARNING: No runtime support yet + STABLEHLO_MULTIPLY = 165, + STABLEHLO_MAXIMUM = 166, + STABLEHLO_RESHAPE = 167, // WARNING: No runtime support yet + STABLEHLO_CLAMP = 168, // WARNING: No runtime support + STABLEHLO_CONCATENATE = 169, // WARNING: No runtime support + STABLEHLO_BROADCAST_IN_DIM = 170, // WARNING: No runtime support + STABLEHLO_CONVOLUTION = 171, // WARNING: No runtime support + STABLEHLO_SLICE = 172, // WARNING: No runtime support + STABLEHLO_CUSTOM_CALL = 173, // WARNING: No runtime support + STABLEHLO_REDUCE = 174, // WARNING: No runtime support + STABLEHLO_ABS = 175, // WARNING: No runtime support + STABLEHLO_AND = 176, // WARNING: No runtime support + STABLEHLO_COSINE = 177, // WARNING: No runtime support + STABLEHLO_EXPONENTIAL = 178, // WARNING: No runtime support + STABLEHLO_FLOOR = 179, // WARNING: No runtime support + STABLEHLO_LOG = 180, // WARNING: No runtime support + STABLEHLO_MINIMUM = 181, + STABLEHLO_NEGATE = 182, // WARNING: No runtime support + STABLEHLO_OR = 183, // WARNING: No runtime support + STABLEHLO_POWER = 184, // WARNING: No runtime support + 
STABLEHLO_REMAINDER = 185, // WARNING: No runtime support + STABLEHLO_RSQRT = 186, // WARNING: No runtime support + STABLEHLO_SELECT = 187, // WARNING: No runtime support + STABLEHLO_SUBTRACT = 188, // WARNING: No runtime support + STABLEHLO_TANH = 189, // WARNING: No runtime support + STABLEHLO_SCATTER = 190, + STABLEHLO_COMPARE = 191, // WARNING: No runtime support + STABLEHLO_CONVERT = 192, // WARNING: No runtime support + STABLEHLO_DYNAMIC_SLICE = 193, // WARNING: No runtime support + STABLEHLO_DYNAMIC_UPDATE_SLICE = 194, // WARNING: No runtime support + STABLEHLO_PAD = 195, + STABLEHLO_IOTA = 196, // WARNING: No runtime support + STABLEHLO_DOT_GENERAL = 197, // WARNING: No runtime support + STABLEHLO_REDUCE_WINDOW = 198, + STABLEHLO_SORT = 199, // WARNING: No runtime support + STABLEHLO_WHILE = 200, // WARNING: No runtime support + STABLEHLO_GATHER = 201, + STABLEHLO_TRANSPOSE = 202, // WARNING: No runtime support + DILATE = 203, + STABLEHLO_RNG_BIT_GENERATOR = 204, + REDUCE_WINDOW = 205 (deprecated), + STABLEHLO_COMPOSITE = 206, // WARNING: No runtime support +} +// LINT.ThenChange(nnapi_linter/linter.proto) + +// Options for the builtin operators. +union BuiltinOptions { + Conv2DOptions, + DepthwiseConv2DOptions, + ConcatEmbeddingsOptions, + LSHProjectionOptions, + Pool2DOptions, + SVDFOptions, + RNNOptions, + FullyConnectedOptions, + SoftmaxOptions, + ConcatenationOptions, + AddOptions, + L2NormOptions, + LocalResponseNormalizationOptions, + LSTMOptions, + ResizeBilinearOptions, + CallOptions, + ReshapeOptions, + SkipGramOptions, + SpaceToDepthOptions, + EmbeddingLookupSparseOptions, + MulOptions, + PadOptions, + GatherOptions, + BatchToSpaceNDOptions, + SpaceToBatchNDOptions, + TransposeOptions, + ReducerOptions, + SubOptions, + DivOptions, + SqueezeOptions, + SequenceRNNOptions, + StridedSliceOptions, + ExpOptions, + TopKV2Options, + SplitOptions, + LogSoftmaxOptions, + CastOptions, + DequantizeOptions, + MaximumMinimumOptions, + ArgMaxOptions, + LessOptions, + NegOptions, + PadV2Options, + GreaterOptions, + GreaterEqualOptions, + LessEqualOptions, + SelectOptions, + SliceOptions, + TransposeConvOptions, + SparseToDenseOptions, + TileOptions, + ExpandDimsOptions, + EqualOptions, + NotEqualOptions, + ShapeOptions, + PowOptions, + ArgMinOptions, + FakeQuantOptions, + PackOptions, + LogicalOrOptions, + OneHotOptions, + LogicalAndOptions, + LogicalNotOptions, + UnpackOptions, + FloorDivOptions, + SquareOptions, + ZerosLikeOptions, + FillOptions, + BidirectionalSequenceLSTMOptions, + BidirectionalSequenceRNNOptions, + UnidirectionalSequenceLSTMOptions, + FloorModOptions, + RangeOptions, + ResizeNearestNeighborOptions, + LeakyReluOptions, + SquaredDifferenceOptions, + MirrorPadOptions, + AbsOptions, + SplitVOptions, + UniqueOptions, + ReverseV2Options, + AddNOptions, + GatherNdOptions, + CosOptions, + WhereOptions, + RankOptions, + ReverseSequenceOptions, + MatrixDiagOptions, + QuantizeOptions, + MatrixSetDiagOptions, + HardSwishOptions, + IfOptions, + WhileOptions, + DepthToSpaceOptions, + NonMaxSuppressionV4Options, + NonMaxSuppressionV5Options, + ScatterNdOptions, + SelectV2Options, + DensifyOptions, + SegmentSumOptions, + BatchMatMulOptions, + CumsumOptions, + CallOnceOptions, + BroadcastToOptions, + Rfft2dOptions, + Conv3DOptions, + HashtableOptions, + HashtableFindOptions, + HashtableImportOptions, + HashtableSizeOptions, + VarHandleOptions, + ReadVariableOptions, + AssignVariableOptions, + RandomOptions, + BucketizeOptions, + GeluOptions, + DynamicUpdateSliceOptions, + 
UnsortedSegmentProdOptions,
+  UnsortedSegmentMaxOptions,
+  UnsortedSegmentMinOptions,
+  UnsortedSegmentSumOptions,
+  ATan2Options,
+  SignOptions,
+  BitcastOptions,
+  BitwiseXorOptions,
+  RightShiftOptions,
+  // DO NOT add new options to this union; doing so will cause a failure in
+  // Java API generation.
+  // Add new builtin options to BuiltinOptions2 instead.
+}
+
+union BuiltinOptions2 {
+  StablehloConcatenateOptions,
+  StablehloBroadcastInDimOptions,
+  StablehloSliceOptions,
+  StablehloConvolutionOptions,
+  StablehloCustomCallOptions,
+  StablehloReduceOptions,
+  StablehloScatterOptions,
+  StablehloCompareOptions,
+  StablehloDynamicSliceOptions,
+  StablehloPadOptions,
+  StablehloIotaOptions,
+  StablehloDotGeneralOptions,
+  StablehloReduceWindowOptions,
+  StablehloSortOptions,
+  StablehloWhileOptions,
+  StablehloGatherOptions,
+  StablehloTransposeOptions,
+  DilateOptions,
+  StablehloRngBitGeneratorOptions,
+  ReduceWindowOptions (deprecated),
+  StableHLOCompositeOptions,
+}
+
+table StablehloGatherOptions {
+  offset_dims : [long];
+  collapsed_slice_dims : [long];
+  start_index_map : [long];
+  index_vector_dim : long;
+  slice_sizes : [long];
+  indices_are_sorted : bool;
+}
+
+table StablehloTransposeOptions {
+  permutation : [long];
+}
+
+enum StablehloPrecisionConfig : uint {
+  DEFAULT,
+  HIGH,
+  HIGHEST,
+}
+
+table StablehloDotGeneralOptions {
+  lhs_batching_dimensions : [long];
+  rhs_batching_dimensions : [long];
+  lhs_contracting_dimensions : [long];
+  rhs_contracting_dimensions : [long];
+  precision_config : [StablehloPrecisionConfig];
+}
+
+table StablehloReduceWindowOptions {
+  window_dimensions : [long];
+  window_strides : [long];
+  base_dilations : [long];
+  window_dilations : [long];
+  padding : [long];
+  body_subgraph_index : int;
+}
+
+table StablehloWhileOptions {
+  cond_subgraph_index : int;
+  body_subgraph_index : int;
+}
+
+table StablehloSortOptions {
+  dimension : long;
+  is_stable : bool;
+  comparator_subgraph_index : int;
+}
+
+table StablehloConcatenateOptions {
+  dimension : long;
+}
+
+table StablehloBroadcastInDimOptions {
+  broadcast_dimensions : [long];
+}
+
+enum StablehloComparisonDirection : uint {
+  STABLEHLO_COMPARISON_DIRECTION_EQ,
+  STABLEHLO_COMPARISON_DIRECTION_NE,
+  STABLEHLO_COMPARISON_DIRECTION_GE,
+  STABLEHLO_COMPARISON_DIRECTION_GT,
+  STABLEHLO_COMPARISON_DIRECTION_LE,
+  STABLEHLO_COMPARISON_DIRECTION_LT,
+}
+
+enum StablehloComparisonType : uint {
+  STABLEHLO_COMPARISON_TYPE_NOTYPE,
+  STABLEHLO_COMPARISON_TYPE_FLOAT,
+  STABLEHLO_COMPARISON_TYPE_FLOAT_TOTAL_ORDER,
+  STABLEHLO_COMPARISON_TYPE_SIGNED,
+  STABLEHLO_COMPARISON_TYPE_UNSIGNED,
+}
+
+table StablehloCompareOptions {
+  comparison_direction : StablehloComparisonDirection;
+  compare_type : StablehloComparisonType;
+}
+
+table StablehloDynamicSliceOptions {
+  slice_sizes : [long];
+}
+
+table StablehloPadOptions {
+  edge_padding_low : [long];
+  edge_padding_high : [long];
+  interior_padding : [long];
+}
+
+table StablehloIotaOptions {
+  iota_dimension : long;
+}
+
+table StablehloCustomCallOptions {
+  call_target_name : string;
+  has_side_effect : bool;
+  backend_config: string;
+  api_version : int;  // will be deprecated
+  called_computations: [int];  // should point to subgraphs of the computations
+  custom_attributes : [ubyte];
+}
+
+table StablehloReduceOptions {
+  dimensions : [long];
+  body_subgraph_index : int;
+}
+
+table StablehloSliceOptions {
+  start_indices : [long];
+  limit_indices : [long];
+  strides : [long];
+}
+
+table StablehloConvolutionOptions {
+  window_strides : [long];
+  padding :
[long]; + lhs_dilation : [long]; + rhs_dilation : [long]; + window_reversal : [bool]; + input_batch_dimension : long; + input_feature_dimension : long; + input_spatial_dimensions : [long]; + kernel_input_feature_dimension : long; + kernel_output_feature_dimension : long; + kernel_spatial_dimensions : [long]; + output_batch_dimension : long; + output_feature_dimension : long; + output_spatial_dimensions : [long]; + feature_group_count : long; + batch_group_count : long; + precision_config : [StablehloPrecisionConfig]; +} + +table StablehloScatterOptions { + indices_are_sorted: bool; + update_window_dims: [long]; + inserted_window_dims: [long]; + scatter_dims_to_operand_dims: [long]; + index_vector_dim: long; + unique_indices: bool; + update_computation_subgraph_index: int; +} + +enum RngAlgorithm : byte { + // An algorithm auto-selected by the system according to device type. + DEFAULT = 0, + // The Philox algorithm, as described in paper + // ['Parallel Random Numbers: As Easy as 1, 2, 3'] + // (https://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + PHILOX = 1, + // The ThreeFry algorithm, as described in paper + // ['Parallel Random Numbers: As Easy as 1, 2, 3'] + // (https://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + THREEFRY = 2, +} + +table StablehloRngBitGeneratorOptions { + algorithm:RngAlgorithm; +} + +// LINT.IfChange +enum Padding : byte { SAME, VALID } +// LINT.ThenChange(//tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td) + +// LINT.IfChange +enum ActivationFunctionType : byte { + NONE = 0, + RELU = 1, + RELU_N1_TO_1 = 2, + RELU6 = 3, + TANH = 4, + SIGN_BIT = 5, +} +// LINT.ThenChange(//tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td) + +table Conv2DOptions { + padding:Padding; + stride_w:int; + stride_h:int; + fused_activation_function:ActivationFunctionType; + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; + // Parameters for Conv2D version 8 or above. + // When set, quantized_bias_type defines the dtype for both bias and accumulator. + quantized_bias_type: TensorType; +} + +// Options for both Conv3D and Conv3DTranspose. +table Conv3DOptions { + padding:Padding; + stride_d:int; + stride_w:int; + stride_h:int; + fused_activation_function:ActivationFunctionType; + dilation_d_factor:int = 1; + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; +} + +table Pool2DOptions { + padding:Padding; + stride_w:int; + stride_h:int; + filter_width:int; + filter_height:int; + fused_activation_function:ActivationFunctionType; +} + +table DepthwiseConv2DOptions { + // Parameters for DepthwiseConv version 1 or above. + padding:Padding; + stride_w:int; + stride_h:int; + // `depth_multiplier` is redundant. It's used by CPU kernels in + // TensorFlow 2.0 or below, but ignored in versions above. + // See comments in lite/c/builtin_op_data.h for more details. + depth_multiplier:int; + fused_activation_function:ActivationFunctionType; + // Parameters for DepthwiseConv version 2 or above. + dilation_w_factor:int = 1; + dilation_h_factor:int = 1; +} + +table ConcatEmbeddingsOptions { + num_channels:int; + num_columns_per_channel:[int]; + embedding_dim_per_channel:[int]; // This could be inferred from parameters. +} + +enum LSHProjectionType: byte { + UNKNOWN = 0, + SPARSE = 1, + DENSE = 2, +} + +table LSHProjectionOptions { + type: LSHProjectionType; +} + +table SVDFOptions { + rank:int; + fused_activation_function:ActivationFunctionType; + // For weights-only quantization, use asymmetric quantization for non + // constant inputs at evaluation time. 
+  asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow RNNCell.
+table RNNOptions {
+  fused_activation_function:ActivationFunctionType;
+  asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow dynamic_rnn with RNNCell.
+table SequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+  asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow bidirectional_dynamic_rnn with RNNCell.
+table BidirectionalSequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+  merge_outputs: bool;
+  asymmetric_quantize_inputs:bool;
+}
+
+// LINT.IfChange
+enum FullyConnectedOptionsWeightsFormat: byte {
+  DEFAULT = 0,
+  SHUFFLED4x16INT8 = 1,
+}
+// LINT.ThenChange(//tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td)
+
+// An implementation of TensorFlow fully_connected (a.k.a. Dense) layer.
+table FullyConnectedOptions {
+  // Parameters for FullyConnected version 1 or above.
+  fused_activation_function:ActivationFunctionType;
+
+  // Parameters for FullyConnected version 2 or above.
+  weights_format:FullyConnectedOptionsWeightsFormat = DEFAULT;
+
+  // Parameters for FullyConnected version 5 or above.
+  // If set to true, then the number of dimensions is preserved. Furthermore,
+  // all but the last dimension of the input and output shapes will be equal.
+  keep_num_dims: bool;
+
+  // Parameters for FullyConnected version 7 or above.
+  // If set to true, then a weights-only op will use asymmetric quantization
+  // for inputs.
+  asymmetric_quantize_inputs: bool;
+
+  // Parameters for FullyConnected version 11 or above.
+  // When set, quantized_bias_type defines the dtype for both bias and
+  // accumulator.
+  quantized_bias_type: TensorType;
+}
+
+table SoftmaxOptions {
+  beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+  axis:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+  fused_activation_function:ActivationFunctionType;
+  // Parameters supported by version 3.
+  pot_scale_int16:bool = true;
+}
+
+table MulOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+  // This field is currently ignored in the L2 Norm Op.
+  fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormalizationOptions {
+  radius:int;
+  bias:float;
+  alpha:float;
+  beta:float;
+}
+
+// LINT.IfChange
+enum LSTMKernelType : byte {
+  // Full LSTM kernel which supports peephole and projection.
+  FULL = 0,
+  // Basic LSTM kernels. Equivalent to TensorFlow BasicLSTMCell.
+  BASIC = 1,
+}
+// LINT.ThenChange(//tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td)
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell.
+table LSTMOptions {
+  // Parameters for LSTM version 1 or above.
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+
+  // Parameters for LSTM version 2 or above.
+  // Basic kernel is only supported in version 2 or above.
+  kernel_type: LSTMKernelType = FULL;
+
+  // Parameters for LSTM version 4 or above.
+  asymmetric_quantize_inputs: bool;
+}
+
+// An implementation of TensorFlow dynamic_rnn with LSTMCell.
+table UnidirectionalSequenceLSTMOptions { + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // If true then first dimension is sequence, otherwise batch. + time_major:bool; + + // Parameter for Unidirectional Sequence LSTM version 3. + asymmetric_quantize_inputs:bool; + + // Parameter for unidirectional sequence RNN version 4. + diagonal_recurrent_tensors:bool; +} + +table BidirectionalSequenceLSTMOptions { + // Parameters supported by version 1: + fused_activation_function:ActivationFunctionType; + cell_clip: float; // Optional, 0.0 means no clipping + proj_clip: float; // Optional, 0.0 means no clipping + + // If true, store the outputs of both directions into the first output. + merge_outputs: bool; + + // Parameters supported by version 2: + // If true then first dimension is sequence, otherwise batch. + // Version 1 implementations assumed time_major to be true, so this default + // value should never change. + time_major: bool = true; + + // Parameters for version 3 or above. + asymmetric_quantize_inputs:bool; +} + +table ResizeBilinearOptions { + new_height: int (deprecated); + new_width: int (deprecated); + align_corners: bool; + half_pixel_centers: bool; +} + +table ResizeNearestNeighborOptions { + align_corners: bool; + half_pixel_centers: bool; +} + +// A call operation options +table CallOptions { + // The subgraph index that needs to be called. + subgraph:uint; +} + +table PadOptions { +} + +table PadV2Options { +} + +table ReshapeOptions { + new_shape:[int]; +} + +table SpaceToBatchNDOptions { +} + +table BatchToSpaceNDOptions { +} + +table SkipGramOptions { + ngram_size: int; + max_skip_size: int; + include_all_ngrams: bool; +} + +table SpaceToDepthOptions { + block_size: int; +} + +table DepthToSpaceOptions { + block_size: int; +} + +table SubOptions { + fused_activation_function:ActivationFunctionType; + // Parameters supported by version 5 + pot_scale_int16:bool = true; +} + +table DivOptions { + fused_activation_function:ActivationFunctionType; +} + +table TopKV2Options { +} + +enum CombinerType : byte { + SUM = 0, + MEAN = 1, + SQRTN = 2, +} + +table EmbeddingLookupSparseOptions { + combiner:CombinerType; +} + +table GatherOptions { + axis: int; + // Parameters for Gather version 5 or above. + batch_dims: int = 0; +} + +table TransposeOptions { +} + +table ExpOptions { +} + +table CosOptions { +} + +table ReducerOptions { + keep_dims: bool; +} + +table SqueezeOptions { + squeeze_dims:[int]; +} + +table SplitOptions { + num_splits: int; +} + +table SplitVOptions { + num_splits: int; +} + +table StridedSliceOptions { + begin_mask: int; + end_mask: int; + ellipsis_mask: int; + new_axis_mask: int; + shrink_axis_mask: int; + // If true, then the end tensor is an offset of the begin tensor. 
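+  // For example (illustrative): with offset = true, begin = [2] and
+  // end = [3], the effective end index is begin + end = 5, i.e. the slice
+  // covers indices [2, 5) along that dimension.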
+ offset: bool; +} + +table LogSoftmaxOptions { +} + +table CastOptions { + in_data_type: TensorType; + out_data_type: TensorType; +} + +table DequantizeOptions { +} + +table MaximumMinimumOptions { +} + +table TileOptions { +} + +table ArgMaxOptions { + output_type : TensorType; +} + +table ArgMinOptions { + output_type : TensorType; +} + +table GreaterOptions { +} + +table GreaterEqualOptions { +} + +table LessOptions { +} + +table LessEqualOptions { +} + +table NegOptions { +} + +table SelectOptions { +} + +table SliceOptions { +} + +table TransposeConvOptions { + // Parameters supported by version 1, 2, 3: + padding:Padding; + stride_w:int; + stride_h:int; + + // Parameters supported by version 4: + fused_activation_function:ActivationFunctionType = NONE; + + // Parameters for TransposeConv version 5 or above. + // If set, use this for bias and accumulator. + // When set, quantized_bias_type defines the dtype for both bias and accumulator. + quantized_bias_type: TensorType; +} + +table ExpandDimsOptions { +} + +table SparseToDenseOptions { + validate_indices:bool; +} + +table EqualOptions { +} + +table NotEqualOptions { +} + +table ShapeOptions { + // Optional output type of the operation (int32 or int64). Defaults to int32. + out_type : TensorType; +} + +table RankOptions { +} + +table PowOptions { +} + +table FakeQuantOptions { + // Parameters supported by version 1: + min:float; + max:float; + num_bits:int; + + // Parameters supported by version 2: + narrow_range:bool; +} + +table PackOptions { + values_count:int; + axis:int; +} + +table LogicalOrOptions { +} + +table OneHotOptions { + axis:int; +} + +table AbsOptions { +} + + +table HardSwishOptions { +} + +table LogicalAndOptions { +} + +table LogicalNotOptions { +} + +table UnpackOptions { + num:int; + axis:int; +} + +table FloorDivOptions { +} + +table SquareOptions { +} + +table ZerosLikeOptions { +} + +table FillOptions { +} + +table FloorModOptions { +} + +table RangeOptions { +} + +table LeakyReluOptions { + alpha:float; +} + +table SquaredDifferenceOptions { +} + +// LINT.IfChange +enum MirrorPadMode : byte { + // Doesn't include borders. + REFLECT = 0, + // Includes borders. + SYMMETRIC = 1, +} +// LINT.ThenChange(//tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td) + +table MirrorPadOptions { + mode:MirrorPadMode; +} + +table UniqueOptions { + idx_out_type:TensorType = INT32; +} + +table ReverseV2Options { +} + +table AddNOptions { +} + +table GatherNdOptions { +} + +table WhereOptions { +} + +table ReverseSequenceOptions { + seq_dim:int; + batch_dim:int = 0; +} + +table MatrixDiagOptions { +} + +table QuantizeOptions { +} + +table MatrixSetDiagOptions { +} + +table IfOptions { + then_subgraph_index:int; + else_subgraph_index:int; +} + +table CallOnceOptions { + init_subgraph_index:int; +} + +table WhileOptions { + cond_subgraph_index:int; + body_subgraph_index:int; +} + +table NonMaxSuppressionV4Options { +} + +table NonMaxSuppressionV5Options { +} + +table ScatterNdOptions { +} + +table SelectV2Options { +} + +table DensifyOptions { +} + +table SegmentSumOptions { +} + +table BatchMatMulOptions { + adj_x:bool; + adj_y:bool; + // Parameters for BatchMatMul version 4 or above. + // If set to true, then weights-only op will use asymmetric quantization for + // inputs. + asymmetric_quantize_inputs: bool; +} + +table CumsumOptions { + exclusive:bool; + reverse:bool; +} + +table BroadcastToOptions { +} + +table Rfft2dOptions { +} + +table HashtableOptions { + // The identity of hash tables. 
This identity will be used across different
+  // subgraphs in the same interpreter instance.
+  table_id:int;
+  key_dtype:TensorType;
+  value_dtype:TensorType;
+}
+
+table HashtableFindOptions {
+}
+
+table HashtableImportOptions {
+}
+
+table HashtableSizeOptions {
+}
+
+table VarHandleOptions {
+  container:string;
+  shared_name:string;
+}
+
+table ReadVariableOptions {
+}
+
+table AssignVariableOptions {
+}
+
+table RandomOptions {
+  seed: long;
+  seed2: long;
+}
+
+table BucketizeOptions {
+  boundaries: [float];  // The bucket boundaries.
+}
+
+table GeluOptions {
+  approximate: bool;
+}
+
+table DynamicUpdateSliceOptions {
+}
+
+table UnsortedSegmentProdOptions {
+}
+
+table UnsortedSegmentMaxOptions {
+}
+
+table UnsortedSegmentSumOptions {
+}
+
+table ATan2Options {
+}
+
+table UnsortedSegmentMinOptions {
+}
+
+table SignOptions {
+}
+
+table BitcastOptions {
+}
+
+table BitwiseXorOptions {
+}
+
+table RightShiftOptions {
+}
+
+table DilateOptions {
+}
+
+enum ReduceWindowFunction : int {
+  UNSUPPORTED,
+  ADD,
+  MUL,
+  MINIMUM,
+  MAXIMUM,
+  ALL,
+  ANY,
+}
+
+table ReduceWindowOptions (deprecated) {
+  reduce_function: ReduceWindowFunction;
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+  // This field is for backward compatibility. It is used when the value of
+  // the extended builtin_code field is less than
+  // BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES.
+  deprecated_builtin_code:byte;
+  custom_code:string;
+
+  // The version of the operator. The version needs to be bumped whenever new
+  // parameters are introduced into an op.
+  version:int = 1;
+
+  // This field was introduced to resolve the op builtin code shortage problem
+  // (the original BuiltinOperator enum field was represented as a byte).
+  // It is used when the value of the extended builtin_code field is greater
+  // than BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES.
+  builtin_code:BuiltinOperator;
+}
+
+enum CustomOptionsFormat : byte {
+  FLEXBUFFERS = 0,
+}
+
+table StableHLOCompositeOptions {
+  name:string;
+  decomposition_subgraph_index:int32;
+  composite_attributes:[ubyte];
+  composite_attributes_format:CustomOptionsFormat;
+  version:int32;
+}
+
+// An operator takes tensors as inputs and outputs. The type of operation being
+// performed is determined by an index into the list of valid OperatorCodes,
+// while the specifics of each operation are configured using builtin_options
+// or custom_options.
+table Operator {
+  // Index into the operator_codes array. Using an integer here avoids
+  // complicated map lookups.
+  opcode_index:uint;
+
+  // Optional inputs are indicated by -1.
+  inputs:[int];
+  outputs:[int];
+
+  builtin_options:BuiltinOptions;
+  custom_options:[ubyte];
+  custom_options_format:CustomOptionsFormat;
+
+  // A list of booleans indicating the input tensors which are being mutated
+  // by this operator (e.g. used by RNN and LSTM).
+  // For example, if the "inputs" array refers to 5 tensors and the second and
+  // fifth are mutable variables, then this list will contain
+  // [false, true, false, false, true].
+  //
+  // If the list is empty, no variable is mutated in this operator.
+  // The list either has the same length as `inputs`, or is empty.
+  mutating_variable_inputs:[bool];
+
+  // A list of indices to the subgraph's "tensors" that are internal to an Op.
+  // Internal tensors are those that do not flow in or out of the operation,
+  // but instead are part of internal computation. As such, the operation's
+  // implementation may manage its memory more efficiently. They are needed
+  // however (i.e. not just an implementation detail) since they are part of
+  // the computation, which may require relevant metadata such as quantization
+  // parameters.
+  intermediates:[int];
+
+  // When an op uses custom_options in a model that is larger than 2GB, the
+  // following attributes are used instead to find the buffer location, which
+  // is stored outside of the flatbuffer; the offset is calculated relative to
+  // the beginning of the file and is only valid if > 1.
+  large_custom_options_offset: ulong;
+  large_custom_options_size: ulong;
+
+  // A FlatBuffers union has a 128-element limit in Java, so a second union
+  // was added. If BuiltinOptions2 runs out, a third one can be added.
+  builtin_options_2 : BuiltinOptions2;
+}
+
+// The root type, defining a subgraph, which typically represents an entire
+// model.
+table SubGraph {
+  // A list of all tensors used in this subgraph.
+  tensors:[Tensor];
+
+  // Indices of the tensors that are inputs into this subgraph. Note this is
+  // the list of non-static tensors that feed into the subgraph for inference.
+  inputs:[int];
+
+  // Indices of the tensors that are outputs out of this subgraph. Note this
+  // is the list of output tensors that are considered the product of the
+  // subgraph's inference.
+  outputs:[int];
+
+  // All operators, in execution order.
+  operators:[Operator];
+
+  // Name of this subgraph (used for debugging).
+  name:string;
+}
+
+// Table of raw data buffers (used for constant tensors). Referenced by tensors
+// by index. The generous alignment accommodates mmap-friendly data structures.
+table Buffer {
+  data:[ubyte] (force_align: 16);
+
+  // In a model that is larger than 2GB, buffers instead use the following
+  // attributes to find the stored data, which is outside of the flatbuffer;
+  // the offset is calculated relative to the beginning of the file and is
+  // only valid if > 1.
+  offset: ulong;
+  size: ulong;
+}
+
+table Metadata {
+  // A human readable string to uniquely identify a Metadata.
+  name:string;
+  // An index to the buffers table.
+  buffer:uint;
+}
+
+// Map from an alias name of a tensor to its index in the graph.
+// This is used in SignatureDef.
+table TensorMap {
+  // Represents the alias to use for this tensor.
+  name:string;
+
+  // The actual tensor index in the primary graph that 'name' corresponds to.
+  tensor_index:uint;
+}
+
+// This corresponds to SignatureDef in TensorFlow SavedModel.
+// The SignatureDef will be part of the SavedModel provided for conversion.
+table SignatureDef {
+  // Named inputs for this signature.
+  inputs:[TensorMap];
+
+  // Named outputs for this signature.
+  outputs:[TensorMap];
+
+  // Key value which was in the TensorFlow SavedModel SignatureDef map.
+  signature_key:string;
+
+  // Model tag, deprecated.
+  deprecated_tag:string (deprecated);
+
+  // Index of the subgraph that corresponds to the exported method.
+  subgraph_index:uint;
+}
+
+table Model {
+  // Version of the schema.
+  version:uint;
+
+  // A list of all operator codes used in this model. This is
+  // kept in order because operators carry an index into this
+  // vector.
+  operator_codes:[OperatorCode];
+
+  // All the subgraphs of the model. The 0th is assumed to be the main
+  // model.
+ subgraphs:[SubGraph]; + + // A description of the model. + description:string; + + // Buffers of the model. + // Note the 0th entry of this array must be an empty buffer (sentinel). + // This is a convention so that tensors without a buffer can provide 0 as + // their buffer. + buffers:[Buffer]; + + // Metadata about the model. Indirects into the existings buffers list. + // Deprecated, prefer to use metadata field. + metadata_buffer:[int]; + + // Metadata about the model. + metadata:[Metadata]; + + // Optional SignatureDefs for the model. + signature_defs:[SignatureDef]; +} + +root_type Model; diff --git a/tensorflow/compiler/mlir/lite/sparsity/BUILD b/tensorflow/compiler/mlir/lite/sparsity/BUILD index fce754995766d5..0566ea545a7b0b 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/BUILD +++ b/tensorflow/compiler/mlir/lite/sparsity/BUILD @@ -30,12 +30,12 @@ cc_library( "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:flatbuffer_translate_lib", "//tensorflow/compiler/mlir/lite:tensorflow_lite_d2s", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/core:protos_all_cc", "//tensorflow/lite:framework", "//tensorflow/lite/core/api", "//tensorflow/lite/core/c:private_c_api_types", - "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/tools/optimize:reduced_precision_support", "@com_google_absl//absl/strings", "@flatbuffers", @@ -54,10 +54,10 @@ tf_cc_test( ], deps = [ ":sparsify_model", + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/lite/core:model_builder", "//tensorflow/lite/core/api:error_reporter", "//tensorflow/lite/core/c:private_c_api_types", - "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/tools/optimize:reduced_precision_support", "@com_google_googletest//:gtest_main", "@flatbuffers", diff --git a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.h b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.h index 53deff6d990bb0..e0659063bc4ec6 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.h +++ b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model.h @@ -16,9 +16,9 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_SPARSITY_SPARSIFY_MODEL_H_ #include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/c/c_api_types.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace lite { diff --git a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model_test.cc b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model_test.cc index 861a02be9caa6c..71fc1927a02217 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/sparsify_model_test.cc +++ b/tensorflow/compiler/mlir/lite/sparsity/sparsify_model_test.cc @@ -26,10 +26,10 @@ limitations under the License. 
#include #include #include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/c/c_api_types.h" #include "tensorflow/lite/core/model_builder.h" -#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/tools/optimize/reduced_precision_support.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index 9976d6ff363c8f..e4001d4c08b695 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -64,9 +64,11 @@ cc_library( deps = [ ":stablehlo_util", "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", ], alwayslink = 1, ) @@ -545,6 +547,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", "@local_xla//xla/mlir_hlo", @@ -752,6 +755,27 @@ cc_library( alwayslink = True, ) +cc_library( + name = "optimize_layout", + srcs = [ + "transforms/optimize_layout.cc", + ], + hdrs = ["transforms/passes.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DialectUtils", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:stablehlo_ops", + ], + alwayslink = 1, +) + gentbl_cc_library( name = "composite_lowering_inc_gen", compatible_with = get_compatible_with_portable(), @@ -794,7 +818,6 @@ tf_cc_binary( "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", - "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_graph_optimization_pass", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD index c487600517f9b8..5add6c730cac5e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/BUILD @@ -1,5 +1,6 @@ load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -18,12 +19,14 @@ package_group( tf_cc_binary( name = "odml-converter", srcs = ["odml_converter_main.cc"], + compatible_with = get_compatible_with_portable(), visibility = [ "//tensorflow/compiler/mlir/lite/stablehlo/odml_converter:__subpackages__", "//third_party/odml/infra:__subpackages__", ], # Prototype phase. 
deps = [ - ":all_passes", + ":outline_composites", + ":shlo_simplify", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_to_vhlo_pass", @@ -33,12 +36,84 @@ tf_cc_binary( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:MlirOptLib", "@llvm-project//mlir:Support", + "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + ], +) + +cc_library( + name = "outline_composites", + srcs = [ + "transforms/outline_composites.cc", + ], + hdrs = ["passes.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":passes_inc_gen", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:BufferizationInterfaces", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + ], + alwayslink = 1, +) + +cc_library( + name = "shlo_simplify", + srcs = [ + "transforms/shlo_simplify.cc", + ], + hdrs = ["passes.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":folders", + ":passes_inc_gen", + ":shlo_simplify_inc_gen", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@stablehlo//:stablehlo_ops", + ], + alwayslink = 1, +) + +gentbl_cc_library( + name = "shlo_simplify_inc_gen", + compatible_with = get_compatible_with_portable(), + tbl_outs = [ + ( + ["-gen-rewriters"], + "transforms/generated_shlo_simplify.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "transforms/shlo_simplify.td", + deps = ["@stablehlo//:stablehlo_ops_td_files"], +) + +cc_library( + name = "folders", + srcs = ["folders.cc"], + hdrs = ["folders.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@stablehlo//:stablehlo_ops", ], ) gentbl_cc_library( name = "passes_inc_gen", + compatible_with = get_compatible_with_portable(), tbl_outs = [ ( [ @@ -52,13 +127,3 @@ gentbl_cc_library( td_file = "passes.td", deps = ["@llvm-project//mlir:PassBaseTdFiles"], ) - -cc_library( - name = "all_passes", - hdrs = ["passes.h"], - deps = [":passes_inc_gen"], -) - -exports_files([ - "run_lit.sh", -]) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.cc new file mode 100644 index 00000000000000..cb48050db47cb5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.cc @@ -0,0 +1,129 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include <optional>
+#include <vector>
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/IR/BuiltinAttributeInterfaces.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
+
+namespace mlir::odml {
+
+namespace {
+
+// Helper class for parsing operands to a foldable operation.
+class FoldAdaptor {
+ public:
+  // Returns std::nullopt if the operation cannot be folded.
+  static std::optional<FoldAdaptor> Create(Operation* operation) {
+    auto foldable_opr = [](Value val) -> bool {
+      return !llvm::isa<BlockArgument>(val) &&
+             llvm::isa<stablehlo::ConstantOp>(val.getDefiningOp());
+    };
+    if (!llvm::all_of(operation->getOperands(), foldable_opr)) {
+      return std::nullopt;
+    }
+    return FoldAdaptor(operation);
+  }
+
+  // Gets a list of ElementsAttr behind each constant operand.
+  llvm::SmallVector<ElementsAttr> OperandData() {
+    llvm::SmallVector<ElementsAttr> res;
+    res.reserve(operation_->getNumOperands());
+    for (auto opr : operation_->getOperands()) {
+      auto op = llvm::dyn_cast<stablehlo::ConstantOp>(opr.getDefiningOp());
+      res.push_back(op.getValue());
+    }
+    return res;
+  }
+
+  // Gets a pointer to the operation to be folded.
+  Operation* Op() { return operation_; }
+
+ private:
+  explicit FoldAdaptor(Operation* operation) : operation_(operation) {}
+  Operation* const operation_;
+};
+
+// APSInt provides operators which APInt does not, so allow for converting
+// to APSInt for computation. Only APInts can be directly read from
+// element attributes.
+static const APFloat& AddSign(const APFloat& v) { return v; }
+static APSInt AddSign(const APInt& v) { return APSInt(v); }
+
+template <typename ValType>
+static LogicalResult FoldDivOpInternal(stablehlo::DivOp op,
+                                       PatternRewriter& rewriter) {
+  auto adaptor = FoldAdaptor::Create(op);
+  if (!adaptor.has_value()) {
+    return failure();
+  }
+  auto const_oprs = adaptor.value().OperandData();
+
+  const bool lhs_splat = const_oprs[0].isSplat();
+  const bool rhs_splat = const_oprs[1].isSplat();
+
+  auto lhs_vals = const_oprs[0].getValues<ValType>();
+  auto rhs_vals = const_oprs[1].getValues<ValType>();
+  const auto num_results = std::max(lhs_vals.size(), rhs_vals.size());
+  std::vector<ValType> res;
+  res.reserve(num_results);
+
+  auto lhs_start = lhs_vals.begin();
+  auto rhs_start = rhs_vals.begin();
+
+  for (int i = 0; i < num_results; ++i) {
+    auto lhs_val = lhs_splat ? *lhs_start : *(lhs_start++);
+    auto rhs_val = rhs_splat ? *rhs_start : *(rhs_start++);
+    auto signed_lhs_val = AddSign(lhs_val);
+    auto signed_rhs_val = AddSign(rhs_val);
+    if (signed_rhs_val.isZero()) {
+      return failure();
+    }
+    res.push_back(signed_lhs_val / signed_rhs_val);
+  }
+
+  auto res_attr = DenseElementsAttr::get(
+      const_oprs[0].getType().cast<ShapedType>(), res);
+  rewriter.replaceOpWithNewOp<stablehlo::ConstantOp>(adaptor.value().Op(),
+                                                     res_attr);
+  return success();
+}
+
+static LogicalResult FoldDivOp(stablehlo::DivOp op, PatternRewriter& rewriter) {
+  auto etype = op.getType().getElementType();
+  if (etype.isa<FloatType>()) {
+    return FoldDivOpInternal<APFloat>(op, rewriter);
+  }
+  if (etype.isa<IntegerType>()) {
+    return FoldDivOpInternal<APInt>(op, rewriter);
+  }
+  return failure();
+}
+}  // namespace
+
+void PopulateFolderPatterns(RewritePatternSet& patternSet) {
+  patternSet.add(FoldDivOp);
+}
+
+}  // namespace mlir::odml
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass_registration.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.h
similarity index 52%
rename from tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass_registration.cc
rename to tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.h
index 3359ba08fd15df..6f3d2d55b33252 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass_registration.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.h
@@ -1,4 +1,4 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_FOLDERS_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_FOLDERS_H_
 
-#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h"
+namespace mlir::odml {
 
-namespace tensorflow {
+// Populates the pattern set with all folding patterns. These patterns
+// are intended to have precedence over any other patterns added to the set.
+void PopulateFolderPatterns(RewritePatternSet &patternSet);
 
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 10,
-                      MlirRoundtripPass);
+}  // namespace mlir::odml
 
-}  // namespace tensorflow
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_FOLDERS_H_
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/odml_converter_main.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/odml_converter_main.cc
index ecd7396c2a4622..a510e640a7abd8 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/odml_converter_main.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/odml_converter_main.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -42,7 +43,7 @@ int main(int argc, char* argv[]) { mlir::DialectRegistry registry; registry.insert(); + mlir::TF::TensorFlowDialect, mlir::chlo::ChloDialect>(); return failed( mlir::MlirOptMain(argc, argv, "ODML Converter Driver\n", registry)); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h index b3589356f196a2..42e5d18e11f965 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h @@ -16,8 +16,18 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_ODML_CONVERTER_PASSES_H_ +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + namespace mlir::odml { +std::unique_ptr> CreateOutlineCompositesPass(); + +std::unique_ptr> CreateSHLOSimplifyPass(); + #define GEN_PASS_REGISTRATION #include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h.inc" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.td b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.td index 800d7e0d2ff59b..45360d8749dc84 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.td @@ -15,3 +15,29 @@ limitations under the License. include "mlir/Pass/PassBase.td" +def OutlineCompositesPass: Pass<"outline-composites", "func::FuncOp"> { + let summary = "Outlines specific patterns into composites."; + let description = [{ + Outline specific patterns into composites. Specific patterns can be any + sub-DAG within a single `Block*`. The signature of the new composite + matches the inupt and output edges from a node in the sub-DAG to a node out + of it. The associated decomposition has the same semantic as the matched + ops, but may not have identical structure. + }]; + + let options = []; + let constructor = "CreateOutlineCompositesPass()"; + let dependentDialects = ["mlir::chlo::ChloDialect", "mlir::stablehlo::StablehloDialect", "mlir::func::FuncDialect"]; +} + +def SHLOSimplifyPass: Pass<"shlo-simplify", "ModuleOp"> { + let summary = "Apply internal canonicalizations and foldings."; + let description = [{ + Applies various internally defined patterns. 
+ }]; + + let options = [ + + ]; + let constructor = "CreateSHLOSimplifyPass()"; +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/BUILD index c990b20c8fb51c..c78441dcfc446a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/BUILD @@ -21,5 +21,6 @@ filegroup( data = [ "//tensorflow/compiler/mlir/lite/stablehlo/odml_converter:odml-converter", "@llvm-project//llvm:FileCheck", + "@llvm-project//mlir:run_lit.sh", ], ) diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/outline_composites.mlir b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/outline_composites.mlir new file mode 100644 index 00000000000000..81726aaba37bcc --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/outline_composites.mlir @@ -0,0 +1,57 @@ +// RUN: odml-converter --outline-composites %s -split-input-file | FileCheck %s + +func.func @geluWithCustomCallErf(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = stablehlo.constant dense<1.000000e+00> : tensor<2xf32> + %1 = stablehlo.constant dense<0.707106769> : tensor<2xf32> + %2 = stablehlo.constant dense<5.000000e-01> : tensor<2xf32> + %3 = stablehlo.multiply %arg0, %2 : tensor<2xf32> + %4 = stablehlo.multiply %arg0, %1 : tensor<2xf32> + %5 = stablehlo.custom_call @mhlo.erf(%4) {mhlo.attributes = {}, mhlo.version = 1 : i64} : (tensor<2xf32>) -> tensor<2xf32> + %6 = stablehlo.add %5, %0 : tensor<2xf32> + %7 = stablehlo.multiply %3, %6 : tensor<2xf32> + return %7 : tensor<2xf32> +} + +// CHECK: func.func private @gelu_decomp_0(%arg0: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %cst = stablehlo.constant dense<1.000000e+00> : tensor<2xf32> +// CHECK: %cst_0 = stablehlo.constant dense<5.000000e-01> : tensor<2xf32> +// CHECK: %cst_1 = stablehlo.constant dense<0.707106769> : tensor<2xf32> +// CHECK: %0 = stablehlo.multiply %arg0, %cst_1 : tensor<2xf32> +// CHECK: %1 = chlo.erf %0 : tensor<2xf32> -> tensor<2xf32> +// CHECK: %2 = stablehlo.add %1, %cst : tensor<2xf32> +// CHECK: %3 = stablehlo.multiply %arg0, %cst_0 : tensor<2xf32> +// CHECK: %4 = stablehlo.multiply %3, %2 : tensor<2xf32> +// CHECK: return %4 : tensor<2xf32> + +// CHECK-LABEL: geluWithCustomCallErf +// CHECK: %0 = stablehlo.composite "odml.internal.gelu" %arg0 {composite_attributes = {approx = false}, decomposition = @gelu_decomp_0} : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %0 + +// ----- + +func.func @geluWithCHLOErf(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = stablehlo.constant dense<1.000000e+00> : tensor<2xf32> + %1 = stablehlo.constant dense<0.707106769> : tensor<2xf32> + %2 = stablehlo.constant dense<5.000000e-01> : tensor<2xf32> + %3 = stablehlo.multiply %arg0, %2 : tensor<2xf32> + %4 = stablehlo.multiply %arg0, %1 : tensor<2xf32> + %5 = chlo.erf %4 : tensor<2xf32> -> tensor<2xf32> + %6 = stablehlo.add %5, %0 : tensor<2xf32> + %7 = stablehlo.multiply %3, %6 : tensor<2xf32> + return %7 : tensor<2xf32> +} + +// CHECK: func.func private @gelu_decomp_0(%arg0: tensor<2xf32>) -> tensor<2xf32> +// CHECK: %cst = stablehlo.constant dense<1.000000e+00> : tensor<2xf32> +// CHECK: %cst_0 = stablehlo.constant dense<5.000000e-01> : tensor<2xf32> +// CHECK: %cst_1 = stablehlo.constant dense<0.707106769> : tensor<2xf32> +// CHECK: %0 = stablehlo.multiply %arg0, %cst_1 : tensor<2xf32> +// CHECK: %1 = chlo.erf %0 : tensor<2xf32> -> 
tensor<2xf32> +// CHECK: %2 = stablehlo.add %1, %cst : tensor<2xf32> +// CHECK: %3 = stablehlo.multiply %arg0, %cst_0 : tensor<2xf32> +// CHECK: %4 = stablehlo.multiply %3, %2 : tensor<2xf32> +// CHECK: return %4 : tensor<2xf32> + +// CHECK-LABEL: geluWithCHLOErf +// CHECK: %0 = stablehlo.composite "odml.internal.gelu" %arg0 {composite_attributes = {approx = false}, decomposition = @gelu_decomp_0} : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %0 diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/shlo_simplify.mlir b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/shlo_simplify.mlir new file mode 100644 index 00000000000000..e06431a1852b3b --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/tests/shlo_simplify.mlir @@ -0,0 +1,96 @@ +// RUN: odml-converter --shlo-simplify %s -split-input-file | FileCheck %s + +func.func @foldDiv() -> tensor<2xf32> { + %0 = stablehlo.constant dense<[2.0, 3.0]> : tensor<2xf32> + %1 = stablehlo.constant dense<[4.0, 6.0]> : tensor<2xf32> + %2 = stablehlo.divide %0, %1 : tensor<2xf32> + return %2 : tensor<2xf32> +} + +// CHECK-LABEL: foldDiv +// CHECK: stablehlo.constant dense<5.000000e-01> : tensor<2xf32> + +// ----- + +func.func @foldDivLHSSplat() -> tensor<2xf32> { + %0 = stablehlo.constant dense<2.0> : tensor<2xf32> + %1 = stablehlo.constant dense<[4.0, 6.0]> : tensor<2xf32> + %2 = stablehlo.divide %0, %1 : tensor<2xf32> + return %2 : tensor<2xf32> +} + +// CHECK-LABEL: foldDivLHSSplat +// CHECK: stablehlo.constant dense<[5.000000e-01, 0.333333343]> : tensor<2xf32> + +// ----- + +func.func @foldDivRHSSplat() -> tensor<2xf32> { + %0 = stablehlo.constant dense<[4.0, 6.0]> : tensor<2xf32> + %1 = stablehlo.constant dense<2.0> : tensor<2xf32> + %2 = stablehlo.divide %0, %1 : tensor<2xf32> + return %2 : tensor<2xf32> +} + +// CHECK-LABEL: foldDivRHSSplat +// CHECK: stablehlo.constant dense<[2.000000e+00, 3.000000e+00]> : tensor<2xf32> + +// ----- + +func.func @foldDivBothSplat() -> tensor<2xf32> { + %0 = stablehlo.constant dense<4.0> : tensor<2xf32> + %1 = stablehlo.constant dense<2.0> : tensor<2xf32> + %2 = stablehlo.divide %0, %1 : tensor<2xf32> + return %2 : tensor<2xf32> +} + +// CHECK-LABEL: foldDivBothSplat +// CHECK: stablehlo.constant dense<2.000000e+00> : tensor<2xf32> + +// ----- + +func.func @foldDivF64() -> tensor<2xf64> { + %0 = stablehlo.constant dense<[2.0, 3.0]> : tensor<2xf64> + %1 = stablehlo.constant dense<[4.0, 6.0]> : tensor<2xf64> + %2 = stablehlo.divide %0, %1 : tensor<2xf64> + return %2 : tensor<2xf64> +} + +// CHECK-LABEL: foldDivF64 +// CHECK: stablehlo.constant dense<5.000000e-01> : tensor<2xf64> + +// ----- + +func.func @foldDivI32() -> tensor<2xi32> { + %0 = stablehlo.constant dense<[9, 3]> : tensor<2xi32> + %1 = stablehlo.constant dense<[4, 6]> : tensor<2xi32> + %2 = stablehlo.divide %0, %1 : tensor<2xi32> + return %2 : tensor<2xi32> +} + +// CHECK-LABEL: foldDivI32 +// CHECK: stablehlo.constant dense<[2, 0]> : tensor<2xi32> + +// ----- + +func.func @divideToMulReciprocalSplat(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = stablehlo.constant dense<2.0> : tensor<2xf32> + %2 = stablehlo.divide %arg0, %0 : tensor<2xf32> + return %2 : tensor<2xf32> +} + +// CHECK-LABEL: divideToMulReciprocalSplat +// CHECK: stablehlo.constant dense<5.000000e-01> : tensor<2xf32> +// CHECK: stablehlo.multiply + +// ----- + +func.func @divideToMulReciprocal(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = stablehlo.constant dense<[2.0, 3.0]> : tensor<2xf32> + %2 = stablehlo.divide %arg0, %0 : 
tensor<2xf32> + return %2 : tensor<2xf32> +} + +// CHECK-LABEL: divideToMulReciprocal +// CHECK: stablehlo.constant dense<[5.000000e-01, 0.333333343]> : tensor<2xf32> +// CHECK: stablehlo.multiply + diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/outline_composites.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/outline_composites.cc new file mode 100644 index 00000000000000..821ba4fa7e4d2f --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/outline_composites.cc @@ -0,0 +1,252 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo + +namespace mlir { +namespace odml { +namespace { + +// TODO - b/330337238: Surface these to other files when needed. +constexpr llvm::StringLiteral kCompositeNamespace = "odml.internal"; +constexpr llvm::StringLiteral kGelu = "gelu"; + +std::string MakeCompositeName(llvm::StringRef op_name) { + return (kCompositeNamespace + "." + op_name).str(); +} + +#define GEN_PASS_DEF_OUTLINECOMPOSITESPASS +#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h.inc" + +constexpr float kOne = 1.0; +const float kOneOverRoot2 = kOne / std::sqrt(2); +constexpr float kHalf = kOne / 2.0; +constexpr float kTolerance = kOne / 1000.0; + +// Gets the operation that uses the sole result of given operation +// if there is only one. +Operation* GetUserIfOnlyOne(Operation* op) { + if (op->getNumResults() != 1) return nullptr; + auto result = op->getResult(0); + if (!result.hasOneUse()) return nullptr; + return (*result.getUses().begin()).getOwner(); +} + +// Gets operation providing value for the given operand of given operation +// if the given operation is the only user. 
+// Determines if the given op is semantically that of the gauss error function.
+bool MatchERF(Operation* op) {
+  if (auto custom_call = llvm::dyn_cast_or_null<stablehlo::CustomCallOp>(op)) {
+    return custom_call.getCallTargetName() == "mhlo.erf";
+  }
+  return llvm::isa<chlo::ErfOp>(op);
+}
+
+// Builds a reference implementation of non-approximate GELU.
+func::FuncOp BuildGELUDecomposition(RankedTensorType type,
+                                    PatternRewriter& rewriter,
+                                    Block* insertion_point) {
+  rewriter.setInsertionPointToStart(insertion_point);
+
+  auto ftype = FunctionType::get(rewriter.getContext(), {type}, {type});
+  auto name = rewriter.getStringAttr("gelu_decomp");
+  func::FuncOp new_func = rewriter.create<func::FuncOp>(
+      insertion_point->front().getLoc(), name, ftype);
+  new_func.setPrivate();
+  new_func.addEntryBlock();
+  rewriter.setInsertionPointToStart(&new_func.getBody().front());
+
+  auto one_val = DenseElementsAttr::get(type, kOne);
+  auto one_cst =
+      rewriter.create<stablehlo::ConstantOp>(rewriter.getUnknownLoc(), one_val);
+
+  auto half_val = DenseElementsAttr::get(type, kHalf);
+  auto half_cst =
+      rewriter.create<stablehlo::ConstantOp>(one_cst.getLoc(), half_val);
+
+  auto one_over_root2_val = DenseElementsAttr::get(type, kOneOverRoot2);
+  auto one_over_root2_cst = rewriter.create<stablehlo::ConstantOp>(
+      half_cst.getLoc(), one_over_root2_val);
+
+  auto mul_op = rewriter.create<stablehlo::MulOp>(one_over_root2_cst.getLoc(),
+                                                  new_func.getArguments()[0],
+                                                  one_over_root2_cst);
+  auto erf_op = rewriter.create<chlo::ErfOp>(mul_op.getLoc(), mul_op);
+  auto add_op =
+      rewriter.create<stablehlo::AddOp>(erf_op.getLoc(), erf_op, one_cst);
+  auto lhs_mul_op = rewriter.create<stablehlo::MulOp>(
+      half_cst.getLoc(), new_func.getArguments()[0], half_cst);
+  auto output_mul_op = rewriter.create<stablehlo::MulOp>(lhs_mul_op.getLoc(),
+                                                         lhs_mul_op, add_op);
+
+  rewriter.create<func::ReturnOp>(output_mul_op.getLoc(),
+                                  output_mul_op.getResult());
+  rewriter.clearInsertionPoint();
+  return new_func;
+}
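In equation form, the decomposition built above is the exact (erf-based) GELU, which is why the composite emitted below carries the attribute approx = false:

  \mathrm{GELU}(x) \;=\; \tfrac{1}{2}\,x\,\Bigl(1 + \operatorname{erf}\bigl(x/\sqrt{2}\bigr)\Bigr)

The two multiply branches in the diagram that follows correspond to the 0.5 * x factor and to the (1 + erf(x / sqrt(2))) term.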
+// Outlines non-approximate GELU into a stablehlo composite.
+//
+//             -> mul 1/sqrt(2) -> erf -> add 1 ->
+// in                                              mul
+//             ---------> mul 0.5 --------------->
+//
+// This pattern assumes all binary elementwise ops with one constant argument
+// have that constant argument as the second operand. It works by identifying
+// `erf` ops and validating the structure around them.
+class OutlineGELU : public RewritePattern {
+ public:
+  explicit OutlineGELU(MLIRContext* context)
+      : RewritePattern(MatchAnyOpTypeTag(), /*benefit=*/1, context) {}
+
+  LogicalResult matchAndRewrite(Operation* op,
+                                PatternRewriter& rewriter) const override {
+    if (!MatchERF(op)) return failure();
+    // `add 1`
+    auto* erf_user = GetUserIfOnlyOne(op);
+    if (!erf_user) return failure();
+
+    // `mul`
+    auto* erf_user_user = GetUserIfOnlyOne(erf_user);
+    if (!erf_user_user) return failure();
+
+    // `mul 1/sqrt(2)`
+    auto* erf_input = GetInputOpWithOneUse(op, 0);
+    if (!erf_input) return failure();
+
+    // `mul 0.5`
+    auto* erf_user_user_input = GetInputOpWithOneUse(erf_user_user, 0);
+    if (!erf_user_user_input) return failure();
+
+    // Check that `mul 0.5` and `mul 1/sqrt(2)` refer to the same input.
+    if (erf_user_user_input->getOperand(0) != erf_input->getOperand(0)) {
+      return failure();
+    }
+
+    // Check that the structural matches have the correct op type and values.
+    auto rhs_mul = llvm::dyn_cast_or_null<stablehlo::MulOp>(erf_input);
+    if (!rhs_mul) return failure();
+
+    auto lhs_mul =
+        llvm::dyn_cast_or_null<stablehlo::MulOp>(erf_user_user_input);
+    if (!lhs_mul) return failure();
+
+    auto output_mul = llvm::dyn_cast_or_null<stablehlo::MulOp>(erf_user_user);
+    if (!output_mul) return failure();
+
+    auto rhs_add = llvm::dyn_cast_or_null<stablehlo::AddOp>(erf_user);
+    if (!rhs_add) return failure();
+
+    if (!HasSplatArg(rhs_add, kOne, 1)) return failure();
+    if (!HasSplatArg(lhs_mul, kHalf, 1)) return failure();
+    if (!HasSplatArg(rhs_mul, kOneOverRoot2, 1)) return failure();
+
+    // Build a function to serve as the GELU decomposition in the
+    // shlo composite op.
+    auto root = op->getParentOfType<ModuleOp>();
+    auto func = BuildGELUDecomposition(
+        rhs_add.getType().cast<RankedTensorType>(), rewriter, root.getBody());
+
+    SymbolTable table(root);
+    (void)table.renameToUnique(func, {});
+
+    rewriter.setInsertionPointAfter(output_mul);
+    auto composite_attrs = rewriter.getDictionaryAttr(
+        {rewriter.getNamedAttr("approx", rewriter.getBoolAttr(false))});
+    auto composite_op = rewriter.create<stablehlo::CompositeOp>(
+        output_mul.getLoc(), func.getResultTypes()[0],
+        SmallVector<Value>{erf_input->getOperand(0)}, MakeCompositeName(kGelu),
+        composite_attrs, func.getSymName());
+    rewriter.replaceAllOpUsesWith(output_mul, composite_op);
+    // Note: these must be erased in reverse topological order to avoid
+    // failing in debug mode.
+    rewriter.eraseOp(output_mul);
+    rewriter.eraseOp(rhs_add);
+    rewriter.eraseOp(op);
+    rewriter.eraseOp(lhs_mul);
+    rewriter.eraseOp(rhs_mul);
+
+    return success();
+  }
+};
+
+class OutlineCompositesPass
+    : public impl::OutlineCompositesPassBase<OutlineCompositesPass> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OutlineCompositesPass)
+
+  void runOnOperation() override {
+    auto func = getOperation();
+    RewritePatternSet patterns(&getContext());
+    patterns.add<OutlineGELU>(&getContext());
+    if (failed(applyPatternsAndFoldGreedily(func, std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<OperationPass<func::FuncOp>> CreateOutlineCompositesPass() {
+  return std::make_unique<OutlineCompositesPass>();
+}
+
+}  // namespace odml
+}  // namespace mlir
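As a usage sketch (not part of this change; the helper name is hypothetical): the factory returns an OperationPass<func::FuncOp>, so a caller would schedule it as a nested pass on a module-level pipeline, e.g.:

#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
#include "mlir/Pass/PassManager.h"  // from @llvm-project

// Hypothetical wiring for the composite outlining pass.
void AddOutlineComposites(mlir::PassManager& pm) {
  // Nested because the pass operates per func.func, not on the module.
  pm.addNestedPass<mlir::func::FuncOp>(
      mlir::odml::CreateOutlineCompositesPass());
}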
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.cc
new file mode 100644
index 00000000000000..668fe06515812e
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.cc
@@ -0,0 +1,60 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <utility>
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo  // IWYU pragma: keep
+#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/folders.h"
+
+namespace mlir {
+namespace odml {
+namespace {
+
+#define GEN_PASS_DEF_SHLOSIMPLIFYPASS
+#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h.inc"
+#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/generated_shlo_simplify.inc"
+
+// Performs miscellaneous odml "cleanup" on the shlo dialect. This is a
+// functional stand-in for canonicalization and folding, which the shlo
+// implementation does not offer directly.
+class SHLOSimplifyPass : public impl::SHLOSimplifyPassBase<SHLOSimplifyPass> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SHLOSimplifyPass)
+
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    RewritePatternSet patterns(&getContext());
+    populateWithGenerated(patterns);
+    PopulateFolderPatterns(patterns);
+    if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<OperationPass<ModuleOp>> CreateSHLOSimplifyPass() {
+  return std::make_unique<SHLOSimplifyPass>();
+}
+
+}  // namespace odml
+}  // namespace mlir
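The TableGen pattern in the new shlo_simplify.td below, together with the folders populated above, rewrites division by a float constant into multiplication by its reciprocal, and the reciprocal itself is folded at compile time (cf. the divideToMulReciprocal tests earlier in this change):

  \frac{x}{c} \;\longrightarrow\; x \cdot \frac{1}{c},
  \qquad \text{e.g. } c = (2,\,3) \;\Rightarrow\; x \cdot (0.5,\; 0.33333334)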
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.td b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.td
new file mode 100644
index 00000000000000..c8d19baeb11d0d
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_converter/transforms/shlo_simplify.td
@@ -0,0 +1,38 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+include "stablehlo/dialect/StablehloOps.td"
+include "mlir/IR/OpBase.td"
+include "mlir/IR/BuiltinAttributes.td"
+include "mlir/IR/CommonAttrConstraints.td"
+include "mlir/IR/CommonTypeConstraints.td"
+
+def CloneF32ElementsAttrWithOnes
+  : NativeCodeCall<"DenseElementsAttr::get($0.getType().cast<ShapedType>(), (float)1.0)">;
+
+def NotConstant : Constraint<
+  CPred<"$0.isa<BlockArgument>() || !llvm::isa<stablehlo::ConstantOp>($0.getDefiningOp())">,
+  "Is not a constant.">;
+
+def : Pat<(StableHLO_DivOp $l,
+           (StableHLO_ConstantOp:$divisor FloatElementsAttr<32>:$cst)),
+          (StableHLO_MulOp $l,
+           (StableHLO_DivOp
+            (StableHLO_ConstantOp (CloneF32ElementsAttrWithOnes $cst)),
+            $divisor)),
+          [(NotConstant $l)]>;
+
+
+
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc
index f1d6b237ac2ef6..28afcd43a03218 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc
@@ -63,7 +63,6 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h"
 #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h"
 #include "xla/mlir/framework/transforms/passes.h"
-#include "xla/mlir_hlo/lhlo/transforms/passes.h"
 #include "xla/mlir_hlo/mhlo/IR/register.h"
 #include "xla/mlir_hlo/mhlo/transforms/passes.h"
 #include "tensorflow/core/platform/errors.h"
@@ -156,7 +155,7 @@ opt<std::string> exported_model_signatures(
 namespace mlir {
 namespace odml {
 
-tensorflow::StatusOr<OwningOpRef<ModuleOp>> ImportSavedModelOrMLIR(
+absl::StatusOr<OwningOpRef<ModuleOp>> ImportSavedModelOrMLIR(
     const std::string& input_path, MLIRContext* context,
     llvm::SourceMgr* source_mgr,
     std::unique_ptr<tensorflow::SavedModelBundle>* saved_model_bundle) {
@@ -215,7 +214,7 @@ tensorflow::Status ExportModule(mlir::ModuleOp module,
   output->os() << result;
   output->keep();
 
-  return ::tensorflow::OkStatus();
+  return absl::OkStatus();
 }
 
 tensorflow::Status ConvertTFToStableHLO(
@@ -261,7 +260,7 @@ tensorflow::Status ConvertTFToStableHLO(
     return tensorflow::errors::Aborted("Lowering to StableHLO failed.");
   }
 
-  return ::tensorflow::OkStatus();
+  return absl::OkStatus();
 }
 
 tensorflow::Status RunConverter(const PassPipelineCLParser& pass_pipeline) {
@@ -352,7 +351,6 @@ void initAllPasses() {
   mlir::registerAllPasses();
   mlir::registerTensorFlowPasses();
   mlir::mhlo::registerAllMhloPasses();
-  mlir::lmhlo::registerAllLmhloPasses();
 
 // These are in compiler/mlir/tf2xla and not part of the above MHLO passes.
mlir::mhlo::registerTfXlaPasses(); mlir::mhlo::registerLegalizeTFPass(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/compose-uniform-quantized-type.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/compose-uniform-quantized-type.mlir index b98d8af67ccd29..7e60dc85a487a6 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/compose-uniform-quantized-type.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/compose-uniform-quantized-type.mlir @@ -27,7 +27,7 @@ module { %20 = call @uniform_dequantize_0(%19, %5, %6) : (tensor<1x3x3x4xi8>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xi8>) -> tensor<1x3x3x4xf32> return %20 : tensor<1x3x3x4xf32> } -// CHECK: %[[FILTER:.*]] = stablehlo.constant() {value = dense<1> : tensor<3x3x4x4xi8>} : () -> tensor<3x3x4x4x!quant.uniform> +// CHECK: %[[FILTER:.*]] = stablehlo.constant() <{value = dense<1> : tensor<3x3x4x4xi8>}> : () -> tensor<3x3x4x4x!quant.uniform> // CHECK: %[[QUANT_ARG:.*]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x3x3x4xf32>) -> tensor<1x3x3x4x!quant.uniform> // CHECK: %[[CONV:.*]] = stablehlo.convolution(%[[QUANT_ARG]], %[[FILTER]]) {{.*}} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x4x!quant.uniform>) -> tensor<1x3x3x4x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = stablehlo.uniform_dequantize %[[CONV]] : (tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x3x4xf32> @@ -87,7 +87,7 @@ module { %18 = call @uniform_dequantize_0(%17, %5, %6) : (tensor<1x3x3x4xi8>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xi8>) -> tensor<1x3x3x4xf32> return %18 : tensor<1x3x3x4xf32> } -// CHECK: %[[FILTER:.*]] = stablehlo.constant() {value = dense<20> : tensor<3x3x4x4xi8>} : () -> tensor<3x3x4x4x!quant.uniform> +// CHECK: %[[FILTER:.*]] = stablehlo.constant() <{value = dense<20> : tensor<3x3x4x4xi8>}> : () -> tensor<3x3x4x4x!quant.uniform> // CHECK: %[[QUANT_ARG:.*]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x3x3x4xf32>) -> tensor<1x3x3x4x!quant.uniform> // CHECK: %[[CONV:.*]] = stablehlo.convolution(%[[QUANT_ARG]], %[[FILTER]]) {{.*}} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x4x!quant.uniform>) -> tensor<1x3x3x4x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = stablehlo.uniform_dequantize %[[CONV]] : (tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x3x4xf32> @@ -182,7 +182,7 @@ module { return %17 : tensor<1x4x3xf32> } // Quantization dimension == 1 because it is the output feature dimension. -// CHECK: %[[FILTER:.*]] = stablehlo.constant() {value = dense<5> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> +// CHECK: %[[FILTER:.*]] = stablehlo.constant() <{value = dense<5> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform> // CHECK: %[[QUANT_ARG:.*]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x4x2xf32>) -> tensor<1x4x2x!quant.uniform> // CHECK: %[[CONV:.*]] = stablehlo.dot_general %[[QUANT_ARG]], %[[FILTER]], contracting_dims = [2] x [0] : (tensor<1x4x2x!quant.uniform>, tensor<2x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = stablehlo.uniform_dequantize %[[CONV]] : (tensor<1x4x3x!quant.uniform>) -> tensor<1x4x3xf32> @@ -238,7 +238,7 @@ module { } // Quantization dimension == 1 because it is the output feature dimension. // Quantized filter values (from f32 constant) are cast to i8. 
-// CHECK: %[[FILTER:.*]] = stablehlo.constant() {value = dense<5> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> +// CHECK: %[[FILTER:.*]] = stablehlo.constant() <{value = dense<5> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform> // CHECK: %[[QUANT_ARG:.*]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x4x2xf32>) -> tensor<1x4x2x!quant.uniform> // CHECK: %[[CONV:.*]] = stablehlo.dot_general %[[QUANT_ARG]], %[[FILTER]], contracting_dims = [2] x [0] : (tensor<1x4x2x!quant.uniform>, tensor<2x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = stablehlo.uniform_dequantize %[[CONV]] : (tensor<1x4x3x!quant.uniform>) -> tensor<1x4x3xf32> @@ -292,7 +292,7 @@ module { return %15 : tensor<1x3xf32> } // Quantization dimension == 1 because it is the output feature dimension. -// CHECK: %[[FILTER:.*]] = stablehlo.constant() {value = dense<5> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> +// CHECK: %[[FILTER:.*]] = stablehlo.constant() <{value = dense<5> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform> // CHECK: %[[QUANT_ARG:.*]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> // CHECK: %[[CONV:.*]] = stablehlo.dot_general %[[QUANT_ARG]], %[[FILTER]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = stablehlo.uniform_dequantize %[[CONV]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> @@ -431,8 +431,8 @@ module { %2 = stablehlo.constant dense<-128> : tensor<1x1x1xi8> // Input 1 zero point (z1). %3 = stablehlo.constant dense<-128> : tensor<1x1x1xi32> // Input 1 zero point (z1) (upcast & folded into i32). %4 = stablehlo.constant dense<4.000000e-01> : tensor<1x1x1xf32> // Input 2 inverse scale (1 / s2). - %5 = stablehlo.constant dense<-3> : tensor<1x1x1xi8> // Input 2 zero point (z2). - %6 = stablehlo.constant dense<-3> : tensor<1x1x1xi32> // Input 2 zero point (z2) (upcast & folded into i32). + %5 = stablehlo.constant dense<0> : tensor<1x1x1xi8> // Input 2 zero point (z2). + %6 = stablehlo.constant dense<0> : tensor<1x1x1xi32> // Input 2 zero point (z2) (upcast & folded into i32). %7 = stablehlo.constant dense<5.000000e-01> : tensor<1x1x1xf32> // Output inverse scale (1 / s3). %8 = stablehlo.constant dense<-5> : tensor<1x1x1xi8> // Output zero point (z3). %9 = stablehlo.constant dense<1.250000e+01> : tensor<1x1x1xf32> // Merged scale (s1 * s2). 
@@ -454,8 +454,8 @@ module { return %23 : tensor<8x16x4xf32> } // CHECK: %[[UQ_0:.*]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<8x16x16xf32>) -> tensor<8x16x16x!quant.uniform> -// CHECK: %[[UQ_1:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<8x16x4xf32>) -> tensor<8x16x4x!quant.uniform> -// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %[[UQ_0]], %[[UQ_1]], batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x16x16x!quant.uniform>, tensor<8x16x4x!quant.uniform>) -> tensor<8x16x4x!quant.uniform> +// CHECK: %[[UQ_1:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<8x16x4xf32>) -> tensor<8x16x4x!quant.uniform> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %[[UQ_0]], %[[UQ_1]], batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x16x16x!quant.uniform>, tensor<8x16x4x!quant.uniform>) -> tensor<8x16x4x!quant.uniform> // CHECK: %[[DQ_0:.*]] = stablehlo.uniform_dequantize %[[DOT_GENERAL]] : (tensor<8x16x4x!quant.uniform>) -> tensor<8x16x4xf32> // CHECK: return %[[DQ_0]] @@ -492,7 +492,7 @@ module { %1 = stablehlo.constant dense<2.000000e-01> : tensor<1x1x1xf32> // Input 1 inverse scale (1 / s1). %2 = stablehlo.constant dense<-128> : tensor<1x1x1xi8> // Input 1 zero point (z1). %3 = stablehlo.constant dense<4.000000e-01> : tensor<1x1x1xf32> // Input 2 inverse scale (1 / s2). - %4 = stablehlo.constant dense<-3> : tensor<1x1x1xi8> // Input 2 zero point (z2). + %4 = stablehlo.constant dense<0> : tensor<1x1x1xi8> // Input 2 zero point (z2). %5 = stablehlo.constant dense<5.000000e-01> : tensor<1x1x1xf32> // Output inverse scale (1 / s3). %6 = stablehlo.constant dense<-5> : tensor<1x1x1xi8> // Output zero point (z3). %7 = stablehlo.constant dense<1.250000e+01> : tensor<1x1x1xf32> // Merged scale (s1 * s2). 
@@ -516,8 +516,8 @@ module { return %23 : tensor<8x16x4xf32> } // CHECK: %[[UQ_0:.*]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<8x16x16xf32>) -> tensor<8x16x16x!quant.uniform> -// CHECK: %[[UQ_1:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<8x16x4xf32>) -> tensor<8x16x4x!quant.uniform> -// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %[[UQ_0]], %[[UQ_1]], batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x16x16x!quant.uniform>, tensor<8x16x4x!quant.uniform>) -> tensor<8x16x4x!quant.uniform> +// CHECK: %[[UQ_1:.*]] = stablehlo.uniform_quantize %[[ARG_1]] : (tensor<8x16x4xf32>) -> tensor<8x16x4x!quant.uniform> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %[[UQ_0]], %[[UQ_1]], batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x16x16x!quant.uniform>, tensor<8x16x4x!quant.uniform>) -> tensor<8x16x4x!quant.uniform> // CHECK: %[[DQ_0:.*]] = stablehlo.uniform_dequantize %[[DOT_GENERAL]] : (tensor<8x16x4x!quant.uniform>) -> tensor<8x16x4xf32> // CHECK: return %[[DQ_0]] diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir index c614ee10bf2b45..4121caa60a8e0d 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir @@ -65,7 +65,7 @@ func.func private @XlaCallModule_aten.avg_pool2d.default.impl_0(%arg0: tensor<1x // CHECK: %[[VAL_2:.*]] = "tfl.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x3x6x6xf32>, tensor<4xi32>) -> tensor<1x6x6x3xf32> // CHECK: %[[VAL_3:.*]] = arith.constant dense<0> : tensor<4x2xi32> // CHECK: %[[VAL_4:.*]] = "tfl.pad"(%[[VAL_2]], %[[VAL_3]]) : (tensor<1x6x6x3xf32>, tensor<4x2xi32>) -> tensor<1x6x6x3xf32> -// CHECK: %[[VAL_5:.*]] = "tfl.average_pool_2d"(%[[VAL_4]]) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x6x6x3xf32>) -> tensor<1x4x4x3xf32> +// CHECK: %[[VAL_5:.*]] = "tfl.average_pool_2d"(%[[VAL_4]]) <{filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x6x6x3xf32>) -> tensor<1x4x4x3xf32> // CHECK: %[[VAL_6:.*]] = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> // CHECK: %[[VAL_7:.*]] = "tfl.transpose"(%[[VAL_5]], %[[VAL_6]]) : (tensor<1x4x4x3xf32>, tensor<4xi32>) -> tensor<1x3x4x4xf32> // CHECK: %[[VAL_8:.*]] = "tf.Identity"(%[[VAL_7]]) {device = ""} : (tensor<1x3x4x4xf32>) -> tensor<*xf32> @@ -102,7 +102,7 @@ func.func private @XlaCallModule_aten.avg_pool2d.default.impl_1(%arg0: tensor<1x // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x3x6x6xf32>) -> tensor<*xf32> { // CHECK: %[[VAL_1:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> // CHECK: %[[VAL_2:.*]] = "tfl.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x3x6x6xf32>, tensor<4xi32>) -> tensor<1x6x6x3xf32> -// CHECK: %[[VAL_3:.*]] = "tfl.average_pool_2d"(%[[VAL_2]]) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x6x6x3xf32>) -> tensor<1x6x6x3xf32> +// CHECK: %[[VAL_3:.*]] = "tfl.average_pool_2d"(%[[VAL_2]]) <{filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x6x6x3xf32>) -> tensor<1x6x6x3xf32> // CHECK: %[[VAL_4:.*]] = arith.constant 
dense<[0, 3, 1, 2]> : tensor<4xi32> // CHECK: %[[VAL_5:.*]] = "tfl.transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor<1x6x6x3xf32>, tensor<4xi32>) -> tensor<1x3x6x6xf32> // CHECK: %[[VAL_6:.*]] = "tf.Identity"(%[[VAL_5]]) {device = ""} : (tensor<1x3x6x6xf32>) -> tensor<*xf32> @@ -172,8 +172,71 @@ func.func private @XlaCallModule_odml.upsample_bilinear2d.impl_21_0(%arg0: tenso // CHECK: %[[VAL_1:.*]] = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> // CHECK: %[[VAL_2:.*]] = "tfl.transpose"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x64x16x16xf32>, tensor<4xi32>) -> tensor<1x16x16x64xf32> // CHECK: %[[VAL_3:.*]] = arith.constant dense<32> : tensor<2xi32> -// CHECK: %[[VAL_4:.*]] = "tfl.resize_bilinear"(%[[VAL_2]], %[[VAL_3]]) {align_corners = false, half_pixel_centers = true} : (tensor<1x16x16x64xf32>, tensor<2xi32>) -> tensor<1x32x32x64xf32> +// CHECK: %[[VAL_4:.*]] = "tfl.resize_bilinear"(%[[VAL_2]], %[[VAL_3]]) <{align_corners = false, half_pixel_centers = true}> : (tensor<1x16x16x64xf32>, tensor<2xi32>) -> tensor<1x32x32x64xf32> // CHECK: %[[VAL_5:.*]] = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> // CHECK: %[[VAL_6:.*]] = "tfl.transpose"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x32x32x64xf32>, tensor<4xi32>) -> tensor<1x64x32x32xf32> // CHECK: return %[[VAL_6]] : tensor<1x64x32x32xf32> // CHECK: } + +func.func private @gelu_decomp(%arg0: tensor<2xf32>) -> tensor<2xf32> +func.func @gelu(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = mhlo.composite "odml.internal.gelu" %arg0 {composite_attributes = {approx = false}, decomposition = @gelu_decomp} : (tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// CHECK-LABEL: gelu +// CHECK: %0 = "tfl.gelu"(%arg0) <{approximate = false}> : (tensor<2xf32>) -> tensor<2xf32> + +// CHECK-LABEL func.func @jax_image_resize_nearest +func.func @jax_image_resize_nearest(%arg0: tensor<1x2x2x10xf32>) -> (tensor<1x4x4x10xf32>) { + %1 = mhlo.composite "odml.jax_resize_nearest_neighbor2d" %arg0 {composite_attributes = {output_size = dense<4> : tensor<2xi64>}, decomposition = @XlaCallModule_odml.jax_resize_nearest_neighbor2d.impl_0} : (tensor<1x2x2x10xf32>) -> tensor<1x4x4x10xf32> + return %1 : tensor<1x4x4x10xf32> +} +func.func private @XlaCallModule_odml.jax_resize_nearest_neighbor2d.impl_0(%arg0: tensor<1x2x2x10xf32>) -> tensor<1x4x4x10xf32> { + %0 = call @XlaCallModule__resize_0(%arg0) : (tensor<1x2x2x10xf32>) -> tensor<1x4x4x10xf32> + return %0 : tensor<1x4x4x10xf32> +} +func.func private @XlaCallModule__resize_0(%arg0: tensor<1x2x2x10xf32>) -> (tensor<1x4x4x10xf32>) { + %0 = mhlo.constant dense<2> : tensor + %1 = mhlo.constant dense<0> : tensor + %2 = mhlo.constant dense<4.000000e+00> : tensor + %3 = mhlo.constant dense<2.000000e+00> : tensor + %4 = mhlo.constant dense<5.000000e-01> : tensor + %5 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<4xf32> + %6 = "mhlo.broadcast_in_dim"(%4) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xf32> + %7 = mhlo.add %5, %6 : tensor<4xf32> + %8 = "mhlo.broadcast_in_dim"(%3) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xf32> + %9 = mhlo.multiply %7, %8 : tensor<4xf32> + %10 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xf32> + %11 = mhlo.divide %9, %10 : tensor<4xf32> + %12 = mhlo.floor %11 : tensor<4xf32> + %13 = mhlo.convert %12 : (tensor<4xf32>) -> tensor<4xi32> + %14 = "mhlo.broadcast_in_dim"(%1) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xi32> + %15 = 
mhlo.compare LT, %13, %14, SIGNED : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %16 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xi32> + %17 = mhlo.add %13, %16 : tensor<4xi32> + %18 = mhlo.select %15, %17, %13 : tensor<4xi1>, tensor<4xi32> + %19 = "mhlo.broadcast_in_dim"(%18) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<4x1xi32> + %20 = "mhlo.gather"(%arg0, %19) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[1, 1, 2, 10]> : tensor<4xi64>}> : (tensor<1x2x2x10xf32>, tensor<4x1xi32>) -> tensor<1x4x2x10xf32> + %21 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<4xf32> + %22 = "mhlo.broadcast_in_dim"(%4) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xf32> + %23 = mhlo.add %21, %22 : tensor<4xf32> + %24 = "mhlo.broadcast_in_dim"(%3) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xf32> + %25 = mhlo.multiply %23, %24 : tensor<4xf32> + %26 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xf32> + %27 = mhlo.divide %25, %26 : tensor<4xf32> + %28 = mhlo.floor %27 : tensor<4xf32> + %29 = mhlo.convert %28 : (tensor<4xf32>) -> tensor<4xi32> + %30 = "mhlo.broadcast_in_dim"(%1) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xi32> + %31 = mhlo.compare LT, %29, %30, SIGNED : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + %32 = "mhlo.broadcast_in_dim"(%0) <{broadcast_dimensions = dense<> : tensor<0xi64>}> : (tensor) -> tensor<4xi32> + %33 = mhlo.add %29, %32 : tensor<4xi32> + %34 = mhlo.select %31, %33, %29 : tensor<4xi1>, tensor<4xi32> + %35 = "mhlo.broadcast_in_dim"(%34) <{broadcast_dimensions = dense<0> : tensor<1xi64>}> : (tensor<4xi32>) -> tensor<4x1xi32> + %36 = "mhlo.gather"(%20, %35) <{dimension_numbers = #mhlo.gather, slice_sizes = dense<[1, 4, 1, 10]> : tensor<4xi64>}> : (tensor<1x4x2x10xf32>, tensor<4x1xi32>) -> tensor<1x4x4x10xf32> + return %36 : tensor<1x4x4x10xf32> +} + +// CHECK: %cst = arith.constant dense<4> : tensor<2xi32> +// CHECK: %0 = "tfl.resize_nearest_neighbor"(%arg0, %cst) <{align_corners = false, half_pixel_centers = true}> : (tensor<1x2x2x10xf32>, tensor<2xi32>) -> tensor<1x4x4x10xf32> +// CHECK: return %0 : tensor<1x4x4x10xf32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir index 268247e815faa3..d64b50b72d533f 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir @@ -5,14 +5,14 @@ module { func.func public @main(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>, %arg2: tensor<1x100x32x4xf32>, %arg3: tensor<1x500x4x4xf32>, %arg4: tensor<1x500x4x4xf32>, %arg5: tensor<1x1x100x500xf32>, %arg6: tensor) -> tensor<1x100x32x4xf32> { - // CHECK-ROUNDTRIP: %0 = "tfl.custom"(%arg2, %arg3, %arg4, %arg5, %arg6) {custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> + // CHECK-ROUNDTRIP: %0 = "tfl.custom"(%arg2, %arg3, %arg4, %arg5, %arg6) <{custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl}> : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> 
tensor<1x100x32x4xf32> %0 = func.call @test_sdpa(%arg2, %arg3, %arg4, %arg5, %arg6) : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> return %0: tensor<1x100x32x4xf32> } // CHECK-LABEL: func.func private @test_sdpa func.func private @test_sdpa(%arg0: tensor<1x100x32x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<1x500x4x4xf32>, %arg3: tensor<1x1x100x500xf32>, %arg4: tensor) -> tensor<1x100x32x4xf32> { - // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2, %arg3, %arg4) {custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2, %arg3, %arg4) <{custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl}> : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> %0 = stablehlo.composite "odml.scaled_dot_product_attention" %arg0, %arg1, %arg2, %arg3, %arg4 {decomposition = @odml.scaled_dot_product_attention.impl} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor) -> tensor<1x100x32x4xf32> return %0 : tensor<1x100x32x4xf32> } @@ -23,8 +23,8 @@ module { // CHECK-LABEL: func.func private @test_multiple_kv_caches func.func private @test_multiple_kv_caches(%arg0: tensor<1x500x4x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<100xi64>, %arg3: tensor<1x100x4x4xf32>, %arg4: tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) { - // CHECK: %0:2 = "tfl.custom"(%arg2, %arg3, %arg4) {custom_code = "odml.update_kv_cache", custom_option = #tfl} : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) - // CHECK: %1:2 = "tfl.custom"(%arg2, %arg3, %arg4) {custom_code = "odml.update_kv_cache", custom_option = #tfl} : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + // CHECK: %0:2 = "tfl.custom"(%arg2, %arg3, %arg4) <{custom_code = "odml.update_kv_cache", custom_option = #tfl}> : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) + // CHECK: %1:2 = "tfl.custom"(%arg2, %arg3, %arg4) <{custom_code = "odml.update_kv_cache", custom_option = #tfl}> : (tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) %0:2 = stablehlo.composite "odml.update_kv_cache" %arg0, %arg1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) %1:2 = stablehlo.composite "odml.update_kv_cache" %0#0, %0#1, %arg2, %arg3, %arg4 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl_0} : (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<100xi64>, tensor<1x100x4x4xf32>, tensor<1x100x4x4xf32>) -> (tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>) return %1#0, %1#1 : tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/optimize_layout.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/optimize_layout.mlir new file mode 100644 index 
00000000000000..25ae45f300b13e
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/optimize_layout.mlir
@@ -0,0 +1,50 @@
+// RUN: odml-to-stablehlo-opt %s --transpose-commute-ops | FileCheck %s
+// CHECK-LABEL:   func.func @commute_transpose_pad(
+// CHECK-SAME:      %[[INPUT:.*]]: tensor<1x112x112x64xf32>,
+// CHECK-SAME:      %[[PAD_VAL:.*]]: tensor<f32>) -> tensor<1x64x114x114xf32> {
+// CHECK:           %[[PAD:.*]] = stablehlo.pad %[[INPUT]], %[[PAD_VAL]],
+// CHECK:             low = [0, 1, 1, 0], high = [0, 1, 1, 0], interior = [0, 0, 0, 0]
+// CHECK:             : (tensor<1x112x112x64xf32>, tensor<f32>) -> tensor<1x114x114x64xf32>
+// CHECK:           %[[TPOS:.*]] = stablehlo.transpose %[[PAD]], dims = [0, 3, 1, 2]
+// CHECK:             : (tensor<1x114x114x64xf32>) -> tensor<1x64x114x114xf32>
+// CHECK:           return %[[TPOS]] : tensor<1x64x114x114xf32>
+
+func.func @commute_transpose_pad(
+    %arg0: tensor<1x112x112x64xf32>, %padding_val: tensor<f32>)
+    -> tensor<1x64x114x114xf32> {
+  %tspos = stablehlo.transpose %arg0, dims = [0, 3, 1, 2]
+      : (tensor<1x112x112x64xf32>) -> tensor<1x64x112x112xf32>
+  %ret = stablehlo.pad %tspos, %padding_val,
+      low = [0, 0, 1, 1], high = [0, 0, 1, 1], interior = [0, 0, 0, 0]
+      : (tensor<1x64x112x112xf32>, tensor<f32>) -> tensor<1x64x114x114xf32>
+  return %ret : tensor<1x64x114x114xf32>
+}
+
+// -----
+// CHECK-LABEL:   func.func @commute_transpose_reduce_window(
+// CHECK-SAME:      %[[INPUT:.*]]: tensor<1x114x114x64xf32>,
+// CHECK-SAME:      %[[PAD_VAL:.*]]: tensor<f32>) -> tensor<1x64x56x56xf32> {
+// CHECK:           %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[INPUT]], %[[PAD_VAL]])
+// CHECK:             <{window_dimensions = array<i64: 1, 3, 3, 1>,
+// CHECK:             window_strides = array<i64: 1, 2, 2, 1>}> ({
+// CHECK:           ^bb0(%[[ARG0:.*]]: tensor<f32>, %[[ARG1:.*]]: tensor<f32>):
+// CHECK:             %[[MAX:.*]] = stablehlo.maximum %[[ARG0]], %[[ARG1]] : tensor<f32>
+// CHECK:             stablehlo.return %[[MAX]] : tensor<f32>
+// CHECK:           }) : (tensor<1x114x114x64xf32>, tensor<f32>) -> tensor<1x56x56x64xf32>
+// CHECK:           %[[TPOS:.*]] = stablehlo.transpose %[[REDUCE]], dims = [0, 3, 1, 2]
+// CHECK:             : (tensor<1x56x56x64xf32>) -> tensor<1x64x56x56xf32>
+// CHECK:           return %[[TPOS]] : tensor<1x64x56x56xf32>
+
+func.func @commute_transpose_reduce_window(
+    %input: tensor<1x114x114x64xf32>,
+    %cst: tensor<f32>) -> tensor<1x64x56x56xf32> {
+  %tpos = stablehlo.transpose %input, dims = [0, 3, 1, 2]
+      : (tensor<1x114x114x64xf32>) -> tensor<1x64x114x114xf32>
+  %ret = "stablehlo.reduce_window"(%tpos, %cst)
+      <{window_dimensions = array<i64: 1, 1, 3, 3>, window_strides = array<i64: 1, 1, 2, 2>}> ({
+      ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):
+        %max = stablehlo.maximum %arg0, %arg1 : tensor<f32>
+        stablehlo.return %max : tensor<f32>
+      }) : (tensor<1x64x114x114xf32>, tensor<f32>) -> tensor<1x64x56x56xf32>
+  return %ret : tensor<1x64x56x56xf32>
+}
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir
index 9be635a44268f6..9b1c3f91ebb4e7 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo.mlir
@@ -15,7 +15,7 @@ func.func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> {
   func.return %0 : tensor<3x2xf32>
 
 // CHECK-LABEL: transpose_2d
-// CHECK-NEXT: %0 = "tfl.pseudo_const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK-NEXT: %0 = "tfl.pseudo_const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64>
 // CHECK-NEXT: %1 = "tfl.cast"(%0) : (tensor<2xi64>) -> tensor<2xi32>
 // CHECK-NEXT: %2 = "tfl.transpose"(%arg0, %1) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32>
 // CHECK-NEXT: return
%2 : tensor<3x2xf32> @@ -26,7 +26,7 @@ func.func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { func.return %0 : tensor<3x2x1xf32> // CHECK-LABEL: transpose_3d -// CHECK-NEXT: %0 = "tfl.pseudo_const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK-NEXT: %0 = "tfl.pseudo_const"() <{value = dense<[2, 1, 0]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK-NEXT: %1 = "tfl.cast"(%0) : (tensor<3xi64>) -> tensor<3xi32> // CHECK-NEXT: %2 = "tfl.transpose"(%arg0, %1) : (tensor<1x2x3xf32>, tensor<3xi32>) -> tensor<3x2x1xf32> // CHECK-NEXT: return %2 : tensor<3x2x1xf32> @@ -37,7 +37,7 @@ func.func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { func.return %0 : tensor<4x?xf32> // CHECK-LABEL: transpose_dynamic_2d -// CHECK-NEXT: %0 = "tfl.pseudo_const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK-NEXT: %0 = "tfl.pseudo_const"() <{value = dense<[1, 0]> : tensor<2xi64>}> : () -> tensor<2xi64> // CHECK-NEXT: %1 = "tfl.cast"(%0) : (tensor<2xi64>) -> tensor<2xi32> // CHECK-NEXT: %2 = "tfl.transpose"(%arg0, %1) : (tensor, tensor<2xi32>) -> tensor<4x?xf32> // CHECK-NEXT: return %2 : tensor<4x?xf32> @@ -63,7 +63,7 @@ func.func @convert_dot_general(%arg0: tensor<3x2x6x5x1xf32>, %arg1: tensor<3x2x4 // CHECK: %[[TRANSPOSED_1:.*]] = "tfl.transpose" // CHECK-NEXT: %[[RESHAPED_0:.*]] = mhlo.reshape %[[TRANSPOSED_0]] // CHECK-NEXT: %[[RESHAPED_1:.*]] = mhlo.reshape %[[TRANSPOSED_1]] -// CHECK-NEXT: %[[BMM_0:.*]] = "tfl.batch_matmul"(%[[RESHAPED_0]], %[[RESHAPED_1]]) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> +// CHECK-NEXT: %[[BMM_0:.*]] = "tfl.batch_matmul"(%[[RESHAPED_0]], %[[RESHAPED_1]]) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> // CHECK-NEXT: %[[RESHAPED_BMM:.*]] = mhlo.reshape %[[BMM_0]] // CHECK-NEXT: return %[[RESHAPED_BMM]] : tensor<3x5x1x4xf32> } @@ -84,7 +84,7 @@ func.func @convert_dot_general_repeated(%arg0: tensor<1x1x1024xf32>, %arg1: tens // CHECK-LABEL: convert_dot_general_repeated // CHECK: %[[RESHAPED_0:.*]] = mhlo.reshape %arg0 // CHECK-NEXT: %[[RESHAPED_1:.*]] = mhlo.reshape %arg1 -// CHECK-NEXT: %[[BMM_0:.*]] = "tfl.batch_matmul"(%[[RESHAPED_0]], %[[RESHAPED_1]]) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : {{.*}} -> tensor<1x1024xf32> +// CHECK-NEXT: %[[BMM_0:.*]] = "tfl.batch_matmul"(%[[RESHAPED_0]], %[[RESHAPED_1]]) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : {{.*}} -> tensor<1x1024xf32> // CHECK-NEXT: %[[RESHAPED_BMM:.*]] = mhlo.reshape %[[BMM_0]] // CHECK-NEXT: return %[[RESHAPED_BMM]] : tensor<1x1x1024xf32> } @@ -101,7 +101,7 @@ func.func @convert_dot_general_int8(%arg0: tensor<256xi8>, %arg1: tensor<256x8xi // CHECK-LABEL: convert_dot_general_int8 // CHECK: %[[RESHAPED_0:.*]] = mhlo.reshape %arg0 // CHECK-NEXT: %[[RESHAPED_1:.*]] = mhlo.reshape %arg1 -// CHECK-NEXT: %[[BMM_0:.*]] = "tfl.batch_matmul"(%[[RESHAPED_0]], %[[RESHAPED_1]]) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : {{.*}} -> tensor<1x8xi32> +// CHECK-NEXT: %[[BMM_0:.*]] = "tfl.batch_matmul"(%[[RESHAPED_0]], %[[RESHAPED_1]]) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : {{.*}} -> tensor<1x8xi32> // CHECK-NEXT: %[[RESHAPED_BMM:.*]] = mhlo.reshape %[[BMM_0]] // CHECK-NEXT: return %[[RESHAPED_BMM]] : tensor<8xi32> } @@ -117,27 +117,27 @@ func.func 
@convert_dot_general_dynamic_rhs_out_dim(%arg0: tensor<4x4x256xf32>, % func.return %0 : tensor<4x4x?xf32> // CHECK-LABEL: convert_dot_general_dynamic_rhs_out_dim -// CHECK: %0 = "tfl.pseudo_const"() {value = dense<[0, 2, 1]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %0 = "tfl.pseudo_const"() <{value = dense<[0, 2, 1]> : tensor<3xi64>}> : () -> tensor<3xi64> // CHECK-NEXT: %1 = "tfl.cast"(%0) : (tensor<3xi64>) -> tensor<3xi32> // CHECK-NEXT: %2 = "tfl.transpose"(%arg1, %1) : (tensor<4x?x256xf32>, tensor<3xi32>) -> tensor<4x256x?xf32> // CHECK-NEXT: %3 = mhlo.reshape %arg0 : (tensor<4x4x256xf32>) -> tensor<4x4x256xf32> // CHECK-NEXT: %4 = "tfl.shape"(%arg1) : (tensor<4x?x256xf32>) -> tensor<3xi32> -// CHECK-NEXT: %5 = "tfl.pseudo_const"() {value = dense<[-1, 0, -1]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-NEXT: %6 = "tfl.pseudo_const"() {value = dense<[-1, -1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-NEXT: %7 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %5 = "tfl.pseudo_const"() <{value = dense<[-1, 0, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-NEXT: %6 = "tfl.pseudo_const"() <{value = dense<[-1, -1, 0]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-NEXT: %7 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %8 = "tfl.unsorted_segment_prod"(%4, %5, %7) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> // CHECK-NEXT: %9 = "tfl.unsorted_segment_prod"(%4, %6, %7) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> -// CHECK-NEXT: %10 = "tfl.pseudo_const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-NEXT: %11 = "tfl.concatenation"(%10, %9, %8) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> +// CHECK-NEXT: %10 = "tfl.pseudo_const"() <{value = dense<4> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-NEXT: %11 = "tfl.concatenation"(%10, %9, %8) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK-NEXT: %12 = mhlo.dynamic_reshape %2, %11 : (tensor<4x256x?xf32>, tensor<3xi32>) -> tensor<4x256x?xf32> -// CHECK-NEXT: %13 = "tfl.batch_matmul"(%3, %12) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<4x4x256xf32>, tensor<4x256x?xf32>) -> tensor<4x4x?xf32> +// CHECK-NEXT: %13 = "tfl.batch_matmul"(%3, %12) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<4x4x256xf32>, tensor<4x256x?xf32>) -> tensor<4x4x?xf32> // CHECK-NEXT: %14 = "tfl.shape"(%arg0) : (tensor<4x4x256xf32>) -> tensor<3xi32> // CHECK-NEXT: %15 = "tfl.shape"(%arg1) : (tensor<4x?x256xf32>) -> tensor<3xi32> -// CHECK-NEXT: %16 = "tfl.pseudo_const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-NEXT: %17 = "tfl.gather"(%14, %16) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<3xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-NEXT: %18 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK-NEXT: %19 = "tfl.gather"(%15, %18) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<3xi32>, tensor<1xi64>) -> tensor<1xi32> -// CHECK-NEXT: %20 = "tfl.concatenation"(%17, %19) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<1xi32>) -> tensor<3xi32> +// CHECK-NEXT: %16 = "tfl.pseudo_const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-NEXT: %17 = "tfl.gather"(%14, %16) <{axis = 0 : i32, 
batch_dims = 0 : i32}> : (tensor<3xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-NEXT: %18 = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK-NEXT: %19 = "tfl.gather"(%15, %18) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<3xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-NEXT: %20 = "tfl.concatenation"(%17, %19) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<2xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK-NEXT: %21 = mhlo.dynamic_reshape %13, %20 : (tensor<4x4x?xf32>, tensor<3xi32>) -> tensor<4x4x?xf32> // CHECK-NEXT: return %21 : tensor<4x4x?xf32> } @@ -153,37 +153,37 @@ func.func @convert_dot_general_dynamic_batch_dim(%arg0: tensor<2x?x2x3xf32>, %ar func.return %0 : tensor<2x?x2x4xf32> // CHECK-LABEL: convert_dot_general_dynamic_batch_dim -// CHECK: %0 = "tfl.pseudo_const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %0 = "tfl.pseudo_const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK-NEXT: %1 = "tfl.cast"(%0) : (tensor<4xi64>) -> tensor<4xi32> // CHECK-NEXT: %2 = "tfl.transpose"(%arg1, %1) : (tensor<2x?x4x3xf32>, tensor<4xi32>) -> tensor<2x?x3x4xf32> // CHECK-NEXT: %3 = "tfl.shape"(%arg0) : (tensor<2x?x2x3xf32>) -> tensor<4xi32> -// CHECK-NEXT: %4 = "tfl.pseudo_const"() {value = dense<[-1, -1, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %5 = "tfl.pseudo_const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %6 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %4 = "tfl.pseudo_const"() <{value = dense<[-1, -1, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %5 = "tfl.pseudo_const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %6 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %7 = "tfl.unsorted_segment_prod"(%3, %4, %6) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK-NEXT: %8 = "tfl.unsorted_segment_prod"(%3, %5, %6) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-NEXT: %9 = "tfl.pseudo_const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-NEXT: %10 = "tfl.gather"(%3, %9) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-NEXT: %11 = "tfl.concatenation"(%10, %7, %8) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-NEXT: %9 = "tfl.pseudo_const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-NEXT: %10 = "tfl.gather"(%3, %9) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-NEXT: %11 = "tfl.concatenation"(%10, %7, %8) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> // CHECK-NEXT: %12 = mhlo.dynamic_reshape %arg0, %11 : (tensor<2x?x2x3xf32>, tensor<4xi32>) -> tensor<2x?x2x3xf32> // CHECK-NEXT: %13 = "tfl.shape"(%arg1) : (tensor<2x?x4x3xf32>) -> tensor<4xi32> -// CHECK-NEXT: %14 = "tfl.pseudo_const"() {value = dense<[-1, -1, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %15 = "tfl.pseudo_const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %16 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %14 = "tfl.pseudo_const"() <{value = dense<[-1, 
-1, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %15 = "tfl.pseudo_const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %16 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %17 = "tfl.unsorted_segment_prod"(%13, %14, %16) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK-NEXT: %18 = "tfl.unsorted_segment_prod"(%13, %15, %16) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-NEXT: %19 = "tfl.pseudo_const"() {value = dense<[0, 1]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-NEXT: %20 = "tfl.gather"(%13, %19) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-NEXT: %21 = "tfl.concatenation"(%20, %18, %17) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-NEXT: %19 = "tfl.pseudo_const"() <{value = dense<[0, 1]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-NEXT: %20 = "tfl.gather"(%13, %19) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-NEXT: %21 = "tfl.concatenation"(%20, %18, %17) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> // CHECK-NEXT: %22 = mhlo.dynamic_reshape %2, %21 : (tensor<2x?x3x4xf32>, tensor<4xi32>) -> tensor<2x?x3x4xf32> -// CHECK-NEXT: %23 = "tfl.batch_matmul"(%12, %22) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<2x?x2x3xf32>, tensor<2x?x3x4xf32>) -> tensor<2x?x2x4xf32> +// CHECK-NEXT: %23 = "tfl.batch_matmul"(%12, %22) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<2x?x2x3xf32>, tensor<2x?x3x4xf32>) -> tensor<2x?x2x4xf32> // CHECK-NEXT: %24 = "tfl.shape"(%arg0) : (tensor<2x?x2x3xf32>) -> tensor<4xi32> // CHECK-NEXT: %25 = "tfl.shape"(%arg1) : (tensor<2x?x4x3xf32>) -> tensor<4xi32> -// CHECK-NEXT: %26 = "tfl.pseudo_const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK-NEXT: %27 = "tfl.gather"(%24, %26) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> -// CHECK-NEXT: %28 = "tfl.pseudo_const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK-NEXT: %29 = "tfl.gather"(%25, %28) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<4xi32>, tensor<1xi64>) -> tensor<1xi32> -// CHECK-NEXT: %30 = "tfl.concatenation"(%27, %29) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-NEXT: %26 = "tfl.pseudo_const"() <{value = dense<[0, 1, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK-NEXT: %27 = "tfl.gather"(%24, %26) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> +// CHECK-NEXT: %28 = "tfl.pseudo_const"() <{value = dense<2> : tensor<1xi64>}> : () -> tensor<1xi64> +// CHECK-NEXT: %29 = "tfl.gather"(%25, %28) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<4xi32>, tensor<1xi64>) -> tensor<1xi32> +// CHECK-NEXT: %30 = "tfl.concatenation"(%27, %29) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<1xi32>) -> tensor<4xi32> // CHECK-NEXT: %31 = mhlo.dynamic_reshape %23, %30 : (tensor<2x?x2x4xf32>, tensor<4xi32>) -> tensor<2x?x2x4xf32> // CHECK-NEXT: return %31 : tensor<2x?x2x4xf32> } @@ -200,35 +200,35 @@ func.func @convert_dot_general_dynamic_lhs_rhs_out_dims(%arg0: tensor<2x2x?x3xf3 func.return %0 : 
tensor<2x2x?x4x?xf32> // CHECK-LABEL: convert_dot_general_dynamic_lhs_rhs_out_dims -// CHECK: %0 = "tfl.pseudo_const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %0 = "tfl.pseudo_const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> // CHECK-NEXT: %1 = "tfl.cast"(%0) : (tensor<4xi64>) -> tensor<4xi32> // CHECK-NEXT: %2 = "tfl.transpose"(%arg1, %1) : (tensor<2x4x?x3xf32>, tensor<4xi32>) -> tensor<2x3x4x?xf32> // CHECK-NEXT: %3 = "tfl.shape"(%arg0) : (tensor<2x2x?x3xf32>) -> tensor<4xi32> -// CHECK-NEXT: %4 = "tfl.pseudo_const"() {value = dense<[-1, 0, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %5 = "tfl.pseudo_const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %6 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %4 = "tfl.pseudo_const"() <{value = dense<[-1, 0, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %5 = "tfl.pseudo_const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %6 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %7 = "tfl.unsorted_segment_prod"(%3, %4, %6) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK-NEXT: %8 = "tfl.unsorted_segment_prod"(%3, %5, %6) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-NEXT: %9 = "tfl.pseudo_const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-NEXT: %10 = "tfl.concatenation"(%9, %7, %8) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> +// CHECK-NEXT: %9 = "tfl.pseudo_const"() <{value = dense<2> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-NEXT: %10 = "tfl.concatenation"(%9, %7, %8) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK-NEXT: %11 = mhlo.dynamic_reshape %arg0, %10 : (tensor<2x2x?x3xf32>, tensor<3xi32>) -> tensor<2x?x3xf32> // CHECK-NEXT: %12 = "tfl.shape"(%arg1) : (tensor<2x4x?x3xf32>) -> tensor<4xi32> -// CHECK-NEXT: %13 = "tfl.pseudo_const"() {value = dense<[-1, 0, 0, -1]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %14 = "tfl.pseudo_const"() {value = dense<[-1, -1, -1, 0]> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK-NEXT: %15 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %13 = "tfl.pseudo_const"() <{value = dense<[-1, 0, 0, -1]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %14 = "tfl.pseudo_const"() <{value = dense<[-1, -1, -1, 0]> : tensor<4xi32>}> : () -> tensor<4xi32> +// CHECK-NEXT: %15 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %16 = "tfl.unsorted_segment_prod"(%12, %13, %15) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> // CHECK-NEXT: %17 = "tfl.unsorted_segment_prod"(%12, %14, %15) : (tensor<4xi32>, tensor<4xi32>, tensor) -> tensor<1xi32> -// CHECK-NEXT: %18 = "tfl.pseudo_const"() {value = dense<2> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-NEXT: %19 = "tfl.concatenation"(%18, %17, %16) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> +// CHECK-NEXT: %18 = "tfl.pseudo_const"() <{value = dense<2> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-NEXT: %19 = "tfl.concatenation"(%18, %17, %16) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor<1xi32>, 
tensor<1xi32>) -> tensor<3xi32> // CHECK-NEXT: %20 = mhlo.dynamic_reshape %2, %19 : (tensor<2x3x4x?xf32>, tensor<3xi32>) -> tensor<2x3x?xf32> -// CHECK-NEXT: %21 = "tfl.batch_matmul"(%11, %20) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<2x?x3xf32>, tensor<2x3x?xf32>) -> tensor<2x?x?xf32> +// CHECK-NEXT: %21 = "tfl.batch_matmul"(%11, %20) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<2x?x3xf32>, tensor<2x3x?xf32>) -> tensor<2x?x?xf32> // CHECK-NEXT: %22 = "tfl.shape"(%arg0) : (tensor<2x2x?x3xf32>) -> tensor<4xi32> // CHECK-NEXT: %23 = "tfl.shape"(%arg1) : (tensor<2x4x?x3xf32>) -> tensor<4xi32> -// CHECK-NEXT: %24 = "tfl.pseudo_const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK-NEXT: %25 = "tfl.gather"(%22, %24) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> -// CHECK-NEXT: %26 = "tfl.pseudo_const"() {value = dense<[1, 2]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK-NEXT: %27 = "tfl.gather"(%23, %26) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> -// CHECK-NEXT: %28 = "tfl.concatenation"(%25, %27) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<2xi32>) -> tensor<5xi32> +// CHECK-NEXT: %24 = "tfl.pseudo_const"() <{value = dense<[0, 1, 2]> : tensor<3xi64>}> : () -> tensor<3xi64> +// CHECK-NEXT: %25 = "tfl.gather"(%22, %24) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<4xi32>, tensor<3xi64>) -> tensor<3xi32> +// CHECK-NEXT: %26 = "tfl.pseudo_const"() <{value = dense<[1, 2]> : tensor<2xi64>}> : () -> tensor<2xi64> +// CHECK-NEXT: %27 = "tfl.gather"(%23, %26) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<4xi32>, tensor<2xi64>) -> tensor<2xi32> +// CHECK-NEXT: %28 = "tfl.concatenation"(%25, %27) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<2xi32>) -> tensor<5xi32> // CHECK-NEXT: %29 = mhlo.dynamic_reshape %21, %28 : (tensor<2x?x?xf32>, tensor<5xi32>) -> tensor<2x2x?x4x?xf32> // CHECK-NEXT: return %29 : tensor<2x2x?x4x?xf32> @@ -246,24 +246,24 @@ func.return %0 : tensor<4x4x256xf32> // CHECK-LABEL: convert_dot_general_dynamic_contracting_dim // CHECK: %0 = "tfl.shape"(%arg0) : (tensor<4x4x?xf32>) -> tensor<3xi32> -// CHECK-NEXT: %1 = "tfl.pseudo_const"() {value = dense<[-1, 0, -1]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-NEXT: %2 = "tfl.pseudo_const"() {value = dense<[-1, -1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-NEXT: %3 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %1 = "tfl.pseudo_const"() <{value = dense<[-1, 0, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-NEXT: %2 = "tfl.pseudo_const"() <{value = dense<[-1, -1, 0]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-NEXT: %3 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %4 = "tfl.unsorted_segment_prod"(%0, %1, %3) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> // CHECK-NEXT: %5 = "tfl.unsorted_segment_prod"(%0, %2, %3) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> -// CHECK-NEXT: %6 = "tfl.pseudo_const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-NEXT: %7 = "tfl.concatenation"(%6, %4, %5) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> +// CHECK-NEXT: %6 = "tfl.pseudo_const"() <{value = dense<4> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-NEXT: %7 = 
"tfl.concatenation"(%6, %4, %5) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK-NEXT: %8 = mhlo.dynamic_reshape %arg0, %7 : (tensor<4x4x?xf32>, tensor<3xi32>) -> tensor<4x4x?xf32> // CHECK-NEXT: %9 = "tfl.shape"(%arg1) : (tensor<4x?x256xf32>) -> tensor<3xi32> -// CHECK-NEXT: %10 = "tfl.pseudo_const"() {value = dense<[-1, -1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-NEXT: %11 = "tfl.pseudo_const"() {value = dense<[-1, 0, -1]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-NEXT: %12 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor +// CHECK-NEXT: %10 = "tfl.pseudo_const"() <{value = dense<[-1, -1, 0]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-NEXT: %11 = "tfl.pseudo_const"() <{value = dense<[-1, 0, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> +// CHECK-NEXT: %12 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor // CHECK-NEXT: %13 = "tfl.unsorted_segment_prod"(%9, %10, %12) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> // CHECK-NEXT: %14 = "tfl.unsorted_segment_prod"(%9, %11, %12) : (tensor<3xi32>, tensor<3xi32>, tensor) -> tensor<1xi32> -// CHECK-NEXT: %15 = "tfl.pseudo_const"() {value = dense<4> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK-NEXT: %16 = "tfl.concatenation"(%15, %14, %13) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> +// CHECK-NEXT: %15 = "tfl.pseudo_const"() <{value = dense<4> : tensor<1xi32>}> : () -> tensor<1xi32> +// CHECK-NEXT: %16 = "tfl.concatenation"(%15, %14, %13) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> // CHECK-NEXT: %17 = mhlo.dynamic_reshape %arg1, %16 : (tensor<4x?x256xf32>, tensor<3xi32>) -> tensor<4x?x256xf32> -// CHECK-NEXT: %18 = "tfl.batch_matmul"(%8, %17) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<4x4x?xf32>, tensor<4x?x256xf32>) -> tensor<4x4x256xf32> +// CHECK-NEXT: %18 = "tfl.batch_matmul"(%8, %17) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<4x4x?xf32>, tensor<4x?x256xf32>) -> tensor<4x4x256xf32> // CHECK-NEXT: %19 = mhlo.reshape %18 : (tensor<4x4x256xf32>) -> tensor<4x4x256xf32> // CHECK-NEXT: return %19 : tensor<4x4x256xf32> } @@ -294,7 +294,7 @@ func.func @convert_argmax(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> // CHECK: %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> // CHECK: %cst = arith.constant dense<2> : tensor<1xi32> - // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xf32> + // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) <{keep_dims = false}> : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xf32> // CHECK: %5 = "tfl.arg_max"(%arg0, %cst) : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xi32> // CHECK: return %4, %5 : tensor<4x32xf32>, tensor<4x32xi32> } @@ -323,7 +323,7 @@ func.func @convert_argmax_constant(%arg0: tensor<2x2x4xf32>) -> (tensor<2x2xf32> // CHECK-DAG: %1 = mhlo.constant dense<0> : tensor // CHECK: %2 = mhlo.constant dense<{{\[\[}}[0, 1, 2, 3], [0, 1, 2, 3]], {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x2x4xi32> // CHECK: %cst = arith.constant dense<2> : tensor<1xi32> - // CHECK: %3 = "tfl.reduce_max"(%arg0, 
%cst) {keep_dims = false} : (tensor<2x2x4xf32>, tensor<1xi32>) -> tensor<2x2xf32> + // CHECK: %3 = "tfl.reduce_max"(%arg0, %cst) <{keep_dims = false}> : (tensor<2x2x4xf32>, tensor<1xi32>) -> tensor<2x2xf32> // CHECK: %4 = "tfl.arg_max"(%arg0, %cst) : (tensor<2x2x4xf32>, tensor<1xi32>) -> tensor<2x2xi32> // CHECK: return %3, %4 : tensor<2x2xf32>, tensor<2x2xi32> } @@ -352,7 +352,7 @@ func.func @convert_argmax_constant_non_z_axis(%arg0: tensor<4x4xf32>) -> (tensor // CHECK-DAG: %1 = mhlo.constant dense<0> : tensor // CHECK: %2 = mhlo.constant dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]]> : tensor<4x4xi32> // CHECK: %cst = arith.constant dense<0> : tensor<1xi32> - // CHECK: %3 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<4x4xf32>, tensor<1xi32>) -> tensor<4xf32> + // CHECK: %3 = "tfl.reduce_max"(%arg0, %cst) <{keep_dims = false}> : (tensor<4x4xf32>, tensor<1xi32>) -> tensor<4xf32> // CHECK: %4 = "tfl.arg_max"(%arg0, %cst) : (tensor<4x4xf32>, tensor<1xi32>) -> tensor<4xi32> // CHECK: return %3, %4 : tensor<4xf32>, tensor<4xi32> } @@ -379,7 +379,7 @@ func.func @convert_argmax_bool(%arg0: tensor<2xi1>) -> tensor { // CHECK-DAG: %1 = mhlo.constant dense : tensor // CHECK: %2 = mhlo.constant dense<0> : tensor // CHECK: %cst = arith.constant dense<0> : tensor<1xi32> - // CHECK: %3 = "tfl.reduce_any"(%arg0, %cst) {keep_dims = false} : (tensor<2xi1>, tensor<1xi32>) -> tensor + // CHECK: %3 = "tfl.reduce_any"(%arg0, %cst) <{keep_dims = false}> : (tensor<2xi1>, tensor<1xi32>) -> tensor // CHECK: %4 = "tfl.arg_max"(%arg0, %cst) : (tensor<2xi1>, tensor<1xi32>) -> tensor // CHECK: return %4 : tensor } @@ -410,7 +410,7 @@ func.func @convert_argmin(%arg0: tensor<4x32x256xf32>) -> (tensor<4x32xf32>, ten // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<256xi32> // CHECK: %3 = "mhlo.broadcast_in_dim"(%2) <{broadcast_dimensions = dense<2> : tensor<1xi64>}> : (tensor<256xi32>) -> tensor<4x32x256xi32> // CHECK: %cst = arith.constant dense<2> : tensor<1xi32> - // CHECK: %4 = "tfl.reduce_min"(%arg0, %cst) {keep_dims = false} : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xf32> + // CHECK: %4 = "tfl.reduce_min"(%arg0, %cst) <{keep_dims = false}> : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xf32> // CHECK: %5 = "tfl.arg_min"(%arg0, %cst) : (tensor<4x32x256xf32>, tensor<1xi32>) -> tensor<4x32xi32> // CHECK: return %4, %5 : tensor<4x32xf32>, tensor<4x32xi32> } @@ -440,7 +440,7 @@ func.func @convert_argmin_i16(%arg0: tensor<2xi16>) -> (tensor, tensor // CHECK-DAG: %2 = mhlo.constant dense<32767> : tensor // CHECK: %3 = mhlo.constant dense<0> : tensor // CHECK: %cst = arith.constant dense<0> : tensor<1xi32> - // CHECK: %4 = "tfl.reduce_min"(%arg0, %cst) {keep_dims = false} : (tensor<2xi16>, tensor<1xi32>) -> tensor + // CHECK: %4 = "tfl.reduce_min"(%arg0, %cst) <{keep_dims = false}> : (tensor<2xi16>, tensor<1xi32>) -> tensor // CHECK: %5 = "tfl.arg_min"(%arg0, %cst) : (tensor<2xi16>, tensor<1xi32>) -> tensor // CHECK: return %4, %5 : tensor, tensor } @@ -470,7 +470,7 @@ func.func @convert_argmin_constant(%arg0: tensor<2x2x4xf32>) -> (tensor<2x2xf32> // CHECK-DAG: %1 = mhlo.constant dense<0> : tensor // CHECK: %2 = mhlo.constant dense<{{\[\[}}[0, 1, 2, 3], [0, 1, 2, 3]], {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x2x4xi32> // CHECK: %cst = arith.constant dense<2> : tensor<1xi32> - // CHECK: %3 = "tfl.reduce_min"(%arg0, %cst) {keep_dims = false} : (tensor<2x2x4xf32>, tensor<1xi32>) -> tensor<2x2xf32> + // CHECK: %3 = 
"tfl.reduce_min"(%arg0, %cst) <{keep_dims = false}> : (tensor<2x2x4xf32>, tensor<1xi32>) -> tensor<2x2xf32> // CHECK: %4 = "tfl.arg_min"(%arg0, %cst) : (tensor<2x2x4xf32>, tensor<1xi32>) -> tensor<2x2xi32> // CHECK: return %3, %4 : tensor<2x2xf32>, tensor<2x2xi32> } @@ -497,7 +497,7 @@ func.func @convert_argmin_bool(%arg0: tensor<2xi1>) -> tensor { // CHECK-DAG: %1 = mhlo.constant dense : tensor // CHECK: %2 = mhlo.constant dense<0> : tensor // CHECK: %cst = arith.constant dense<0> : tensor<1xi32> - // CHECK: %3 = "tfl.reduce_all"(%arg0, %cst) {keep_dims = false} : (tensor<2xi1>, tensor<1xi32>) -> tensor + // CHECK: %3 = "tfl.reduce_all"(%arg0, %cst) <{keep_dims = false}> : (tensor<2xi1>, tensor<1xi32>) -> tensor // CHECK: %4 = "tfl.arg_min"(%arg0, %cst) : (tensor<2xi1>, tensor<1xi32>) -> tensor // CHECK: return %4 : tensor } @@ -528,7 +528,7 @@ func.func @convert_argmax_with_reshaped_iota(%arg0: tensor<1x32x1xf32>) -> (tens // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<32xi32> // CHECK: %3 = mhlo.reshape %2 : (tensor<32xi32>) -> tensor<1x32x1xi32> // CHECK: %cst = arith.constant dense<1> : tensor<1xi32> - // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<1x32x1xf32>, tensor<1xi32>) -> tensor<1x1xf32> + // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) <{keep_dims = false}> : (tensor<1x32x1xf32>, tensor<1xi32>) -> tensor<1x1xf32> // CHECK: %5 = "tfl.arg_max"(%arg0, %cst) : (tensor<1x32x1xf32>, tensor<1xi32>) -> tensor<1x1xi32> // CHECK: return %4, %5 : tensor<1x1xf32>, tensor<1x1xi32> } @@ -556,7 +556,7 @@ func.func @convert_pytorch_argmax(%arg0: tensor<1x9xi32>) -> tensor<1xi32> { // CHECK: %2 = "mhlo.iota"() <{iota_dimension = 0 : i64}> : () -> tensor<9xi32> // CHECK: %3 = mhlo.reshape %2 : (tensor<9xi32>) -> tensor<1x9xi32> // CHECK: %cst = arith.constant dense<1> : tensor<1xi32> - // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = false} : (tensor<1x9xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK: %4 = "tfl.reduce_max"(%arg0, %cst) <{keep_dims = false}> : (tensor<1x9xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: %5 = "tfl.arg_max"(%arg0, %cst) : (tensor<1x9xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: return %5 : tensor<1xi32> } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_custom_call.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_custom_call.mlir index f0cba06bd984d8..8b1b24e9888508 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_custom_call.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tfl_legalize_hlo_custom_call.mlir @@ -8,10 +8,10 @@ func.func @mhlo_custom_call_test__legalize_string_backend_config(%arg0: tensor<1 } : (tensor<1x4xf32>) -> (tensor<1x8xf32>) func.return %0 : tensor<1x8xf32> - // CHECK: %0 = "tfl.custom"(%arg0) { + // CHECK: %0 = "tfl.custom"(%arg0) <{ // CHECK-SAME: custom_code = "custom_call.my_custom_op", // CHECK-SAME: custom_option = #tfl - // CHECK-SAME: } : (tensor<1x4xf32>) -> tensor<1x8xf32> + // CHECK-SAME: }> : (tensor<1x4xf32>) -> tensor<1x8xf32> } // CHECK-LABEL: mhlo_custom_call_test__dont_legalize_dict_backend_config @@ -35,10 +35,10 @@ func.func @mhlo_custom_call_test__api_version_4(%arg0: tensor<1x4xf32>) -> tenso } : (tensor<1x4xf32>) -> (tensor<1x8xf32>) func.return %0 : tensor<1x8xf32> - // CHECK: %0 = "tfl.custom"(%arg0) { + // CHECK: %0 = "tfl.custom"(%arg0) <{ // CHECK-SAME: custom_code = "custom_call.my_custom_op", // CHECK-SAME: custom_option = #tfl - // CHECK-SAME: } : (tensor<1x4xf32>) -> 
tensor<1x8xf32> + // CHECK-SAME: }> : (tensor<1x4xf32>) -> tensor<1x8xf32> } // CHECK-LABEL: mhlo_custom_call_does_not_legalize_tf_function diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir index 7107f7dcb08a45..64b14b85fc7c71 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir @@ -13,7 +13,7 @@ func.func @uniform_quantize_op(%arg: tensor<2x2xf32>) -> tensor<2x2x!quant.unifo return %0 : tensor<2x2x!quant.uniform> } // CHECK-LABEL: uniform_quantize_op -// CHECK: %[[QUANT:.+]] = "tfl.quantize"({{.*}}) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> +// CHECK: %[[QUANT:.+]] = "tfl.quantize"({{.*}}) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> // CHECK: return %[[QUANT]] // ----- @@ -120,9 +120,9 @@ func.func @convolution_upstream_same_padding_srq(%arg0: tensor<1x3x3x4x!quant.un // to (2, 3, 3, 4). // CHECK-LABEL: convolution_upstream_same_padding_srq // CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>}> : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> // CHECK: return %[[CONV2D]] : tensor<1x3x3x2x!quant.uniform> // ----- @@ -149,10 +149,10 @@ func.func @convolution_upstream_srq_valid_padding(%arg0: tensor<1x3x3x4x!quant.u } // CHECK-LABEL: convolution_upstream_srq_valid_padding // CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> -// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> 
tensor<2x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>}> : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> // CHECK-NOT: tfl.pad -// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> // CHECK: return %[[CONV2D]] : tensor<1x1x1x2x!quant.uniform> // ----- @@ -168,9 +168,9 @@ func.func @convolution_upstream_srq_valid_padding(%arg0: tensor<1x3x3x4x!quant.u } // CHECK-LABEL: convolution_upstream_srq_valid_padding // CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> -// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>}> : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> // CHECK: return %[[CONV2D]] : tensor<1x1x1x2x!quant.uniform> // ----- @@ -186,10 +186,10 @@ func.func @convolution_upstream_srq_strides(%arg0: tensor<1x3x3x4x!quant.uniform } // CHECK-LABEL: convolution_upstream_srq_strides // CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, 
value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>}> : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> // Tests that the stride_w is set to 2. -// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 2 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 2 : i32}> : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> // CHECK: return %[[CONV2D]] : tensor<1x3x2x2x!quant.uniform> // ----- @@ -210,8 +210,8 @@ func.func @dot_general_upstream_srq_asym_input(%arg0: tensor<1x2x3x4x!quant.unif } // CHECK-LABEL: dot_general_upstream_srq_asym_input // CHECK-SAME: %[[ARG:.+]]: tensor<1x2x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> -// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>}> : () -> tensor<1x2x4x5x!quant.uniform> +// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) <{adj_x = false, adj_y = false}> : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> // ----- @@ -233,7 +233,7 @@ func.func @dot_general_upstream_srq_sym_input(%arg0: tensor<1x2x3x4x!quant.unifo // CHECK-LABEL: dot_general_upstream_srq_sym_input // CHECK-SAME: %[[ARG:.+]]: tensor<1x2x3x4x!quant.uniform> // CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() -// CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} +// CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) <{adj_x = false, adj_y = false}> // ----- @@ -252,7 +252,7 @@ func.func @dot_general_upstream_srq_activation_rhs(%arg0: tensor<1x2x3x4x!quant. 
return %0 : tensor<1x2x3x5x!quant.uniform> } // CHECK-LABEL: dot_general_upstream_srq_activation_rhs -// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> +// CHECK: "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> // ----- @@ -274,8 +274,8 @@ func.func @dot_general_upstream_srq_adj_x(%arg0: tensor<1x2x4x3x!quant.uniform> } // CHECK-SAME: %[[ARG:.+]]: tensor<1x2x4x3x!quant.uniform> -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> -// CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = true, adj_y = false} +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>}> : () -> tensor<1x2x4x5x!quant.uniform> +// CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) <{adj_x = true, adj_y = false}> // ----- @@ -297,8 +297,8 @@ func.func @dot_general_upstream_srq_adj_y(%arg0: tensor<1x2x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x5x4x!quant.uniform>, value = dense<1> : tensor<1x2x5x4xi8>} : () -> tensor<1x2x5x4x!quant.uniform> -// CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = true} +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x5x4x!quant.uniform>, value = dense<1> : tensor<1x2x5x4xi8>}> : () -> tensor<1x2x5x4x!quant.uniform> +// CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) <{adj_x = false, adj_y = true}> // ----- @@ -393,13 +393,13 @@ func.func @dot_general_upstream_srq_float_operands(%arg0: tensor<1x2x3x4xf32>, % // CHECK-LABEL: dot_general_upstream_srq_asym_weight func.func @dot_general_upstream_srq_asym_weight(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { - %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> - %1 = "stablehlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #stablehlo.dot, precision_config = [#stablehlo, #stablehlo]} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> + %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> + %1 = "stablehlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #stablehlo.dot, precision_config = [#stablehlo, #stablehlo]} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> } // CHECK-SAME: %[[ARG:.+]]: tensor<1x2x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> -// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>}> : () -> tensor<1x2x4x5x!quant.uniform> +// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) <{adj_x = false, adj_y = false}> : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> 
tensor<1x2x3x5x!quant.uniform> // ----- @@ -414,10 +414,10 @@ func.func @dot_general_upstream_srq_per_axis_quantized_filter(%arg0: tensor<1x3x } // CHECK-SAME: %[[ARG_0:.+]]: tensor<1x3x!quant.uniform> // Weight tensor is transposed, as tfl.fully_connected accepts a [o, i] matrix. -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<1> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<1> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> // Bias tensor's scale is input scale * filter scale. -// CHECK: %[[FC:.+]] = "tfl.fully_connected"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x3x!quant.uniform>, tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[FC:.+]] = "tfl.fully_connected"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x3x!quant.uniform>, tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x2x!quant.uniform> // CHECK-NEXT: return %[[FC]] : tensor<1x2x!quant.uniform> // ----- @@ -427,8 +427,8 @@ func.func @dot_general_upstream_srq_per_axis_quantized_filter(%arg0: tensor<1x3x // CHECK-LABEL: dot_general_upstream_srq_per_axis_quantized_filter_with_batch_dim func.func @dot_general_upstream_srq_per_axis_quantized_filter_with_batch_dim(%arg0: tensor<1x1x3x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> { - %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> - %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x1x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> + %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> + %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x1x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> return %1 : tensor<1x1x2x!quant.uniform> } // Nothing changes. 
@@ -443,8 +443,8 @@ func.func @dot_general_upstream_srq_per_axis_quantized_filter_with_batch_dim(%ar // CHECK-LABEL: dot_general_upstream_srq_per_axis_quantized_filter_multibatch func.func @dot_general_upstream_srq_per_axis_quantized_filter_multibatch(%arg0: tensor<3x1x3x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> { - %0 = stablehlo.constant() {value = dense<1> : tensor<3x3x2xi8>} : () -> tensor<3x3x2x!quant.uniform> - %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x1x3x!quant.uniform>, tensor<3x3x2x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> + %0 = stablehlo.constant() {value = dense<1> : tensor<3x3x2xi8>} : () -> tensor<3x3x2x!quant.uniform> + %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x1x3x!quant.uniform>, tensor<3x3x2x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> return %1 : tensor<3x1x2x!quant.uniform> } // Nothing changes. @@ -459,8 +459,8 @@ func.func @dot_general_upstream_srq_per_axis_quantized_filter_multibatch(%arg0: // CHECK-LABEL: dot_general_upstream_srq_per_axis_quantized_filter_with_multiple_contracting_dims func.func @dot_general_upstream_srq_per_axis_quantized_filter_with_multiple_contracting_dims(%arg0: tensor<1x2x3x!quant.uniform>) -> tensor<1x1x!quant.uniform> { - %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> - %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1, 2] x [2, 1] : (tensor<1x2x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x!quant.uniform> + %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1, 2] x [2, 1] : (tensor<1x2x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x!quant.uniform> return %1 : tensor<1x1x!quant.uniform> } // Nothing changes. 
@@ -486,17 +486,17 @@ func.func @dot_general_upstream_srq_per_axis_quantized_filter_with_multiple_cont // * dot_general_with_relu6_fn func.func @dot_general_srq(%arg0: tensor<1x1024x!quant.uniform>) -> (tensor<1x3x!quant.uniform>) { - %0 = stablehlo.constant() {value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32:1, {2.000000e+0, 2.000000e+0}>> - %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x1024x!quant.uniform>, tensor<1024x3x!quant.uniform:f32:1, {2.000000e+0, 2.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %0 = stablehlo.constant() {value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32:1, {2.000000e+0, 2.000000e+0, 2.000000e+0}>> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x1024x!quant.uniform>, tensor<1024x3x!quant.uniform:f32:1, {2.000000e+0, 2.000000e+0, 2.000000e+0}>>) -> tensor<1x3x!quant.uniform> %2 = stablehlo.uniform_quantize %1 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> return %2 : tensor<1x3x!quant.uniform> } // CHECK-LABEL: dot_general_srq // CHECK-SAME: (%[[ARG_1:.+]]: tensor<1x1024x!quant.uniform) -> tensor<1x3x!quant.uniform> // CHECK-NOT: stablehlo.dot_general -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00}>>, value = dense<1> : tensor<3x1024xi8>} : () -> tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00}>> -// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform>, value = dense<0> : tensor<3xi32>} : () -> tensor<3x!quant.uniform> -// CHECK: %[[FULLY_CONNECTED:.+]] = "tfl.fully_connected"(%[[ARG_1]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00}>>, tensor<3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>>, value = dense<1> : tensor<3x1024xi8>}> : () -> tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>> +// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<3x!quant.uniform>, value = dense<0> : tensor<3xi32>}> : () -> tensor<3x!quant.uniform> +// CHECK: %[[FULLY_CONNECTED:.+]] = "tfl.fully_connected"(%[[ARG_1]], %[[QCONST_0]], %[[QCONST_1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>>, tensor<3x!quant.uniform>) -> tensor<1x3x!quant.uniform> // CHECK-NOT: tfl.batch_matmul // CHECK: return %[[FULLY_CONNECTED]] @@ -516,9 +516,9 @@ func.func @dot_general_with_bias_same_shape_srq(%arg0: tensor<1x1024x!quant.unif } // CHECK-LABEL: dot_general_with_bias_same_shape // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x1024x!quant.uniform>) -> tensor<1x3x!quant.uniform> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>>, value = dense<1> : tensor<3x1024xi8>} : () -> tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform>, value = dense<2> : tensor<1x3xi32>} : () -> tensor<3x!quant.uniform> -// CHECK: %[[FULLY_CONNECTED:.+]] = 
"tfl.fully_connected"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>>, tensor<3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>>, value = dense<1> : tensor<3x1024xi8>}> : () -> tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<3x!quant.uniform>, value = dense<2> : tensor<1x3xi32>}> : () -> tensor<3x!quant.uniform> +// CHECK: %[[FULLY_CONNECTED:.+]] = "tfl.fully_connected"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform:f32:0, {2.000000e+00,2.000000e+00,2.000000e+00}>>, tensor<3x!quant.uniform>) -> tensor<1x3x!quant.uniform> // CHECK: return %[[FULLY_CONNECTED]] // ----- @@ -542,9 +542,9 @@ func.func @dot_general_srq_constant_transpose_rhs(%arg0: tensor<1x3x!quant.unifo // Checks that the `tfl.pseudo_qconst` corresponding to the `stablehlo.constant` // has the same shape. -// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x!quant.uniform>, value = dense<1> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> -// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[FULLY_CONNECTED:.+]] = "tfl.fully_connected"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x3x!quant.uniform>, tensor<2x3x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x!quant.uniform>, value = dense<1> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform> +// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> +// CHECK: %[[FULLY_CONNECTED:.+]] = "tfl.fully_connected"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x3x!quant.uniform>, tensor<2x3x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<1x2x!quant.uniform> // Also checks that the i32 -> i8 uniform quantize is absorbed into // `tfl.fully_connected`. @@ -557,9 +557,9 @@ func.func @dot_general_srq_constant_transpose_rhs(%arg0: tensor<1x3x!quant.unifo // (e.g. argument), the conversion to `tfl.fully_connected` doesn't happen. 
// CHECK-LABEL: dot_general_srq_arg_transpose_rhs -func.func @dot_general_srq_arg_transpose_rhs(%arg0: tensor<1x3x!quant.uniform>, %arg1: tensor<2x3x!quant.uniform>) -> tensor<1x2x!quant.uniform> { - %1 = stablehlo.transpose %arg1, dims = [1, 0] : (tensor<2x3x!quant.uniform>) -> tensor<3x2x!quant.uniform> - %2 = stablehlo.dot_general %arg0, %1, contracting_dims = [1] x [0] : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +func.func @dot_general_srq_arg_transpose_rhs(%arg0: tensor<1x3x!quant.uniform>, %arg1: tensor<2x3x!quant.uniform>) -> tensor<1x2x!quant.uniform> { + %1 = stablehlo.transpose %arg1, dims = [1, 0] : (tensor<2x3x!quant.uniform>) -> tensor<3x2x!quant.uniform> + %2 = stablehlo.dot_general %arg0, %1, contracting_dims = [1] x [0] : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> %3 = stablehlo.uniform_quantize %2 : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> return %3 : tensor<1x2x!quant.uniform> } @@ -577,7 +577,7 @@ func.func @dot_general_srq_arg_transpose_rhs(%arg0: tensor<1x3x!quant.uniform qi8 requantization is // properly lowered to `tfl.batch_matmul`. -func.func @dot_general_srq_to_batch_matmul(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +func.func @dot_general_srq_to_batch_matmul(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = "stablehlo.dot_general"(%arg0, %arg1) { dot_dimension_numbers = #stablehlo.dot< lhs_batching_dimensions = [0, 1], @@ -586,14 +586,14 @@ func.func @dot_general_srq_to_batch_matmul(%arg0: tensor<1x2x3x4x!quant.uniform< rhs_contracting_dimensions = [2] >, precision_config = [#stablehlo, #stablehlo] - } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> + } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> %1 = stablehlo.uniform_quantize %0 : (tensor<1x2x3x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> } // CHECK-LABEL: dot_general_srq_to_batch_matmul -// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x2x3x4x!quant.uniform>, %[[ARG_1:.+]]: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> -// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG_0]], %[[ARG_1]]) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x2x3x4x!quant.uniform>, %[[ARG_1:.+]]: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> +// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG_0]], %[[ARG_1]]) <{adj_x = false, adj_y = false}> : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> // CHECK-NOT: stablehlo.dot_general // CHECK-NOT: stablehlo.uniform_quantize // CHECK-NOT: tfl.fully_connected @@ -606,7 +606,7 @@ func.func @dot_general_srq_to_batch_matmul(%arg0: tensor<1x2x3x4x!quant.uniform< // not converted to `tfl.batch_matmul` when there are multiple use of the // intermediate result. 
-func.func @dot_general_srq_multiple_use_of_intermediate_result(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +func.func @dot_general_srq_multiple_use_of_intermediate_result(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = "stablehlo.dot_general"(%arg0, %arg1) { dot_dimension_numbers = #stablehlo.dot< lhs_batching_dimensions = [0, 1], @@ -615,7 +615,7 @@ func.func @dot_general_srq_multiple_use_of_intermediate_result(%arg0: tensor<1x2 rhs_contracting_dimensions = [2] >, precision_config = [#stablehlo, #stablehlo] - } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> + } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> %1 = stablehlo.uniform_quantize %0 : (tensor<1x2x3x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> %2 = stablehlo.uniform_quantize %0 : (tensor<1x2x3x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> %3 = stablehlo.add %1, %2 : tensor<1x2x3x5x!quant.uniform> @@ -646,11 +646,11 @@ func.func @conv_srq(%arg0: tensor<1x5x5x2x!quant.uniform } // CHECK-LABEL: func.func @conv_srq // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x5x5x2x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> -// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() {value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x4x4x2xi8>} : () -> tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x!quant.uniform>, value = dense<0> : tensor<4xi32>} : () -> tensor<4x!quant.uniform> +// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() <{value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x4x4x2xi8>}> : () -> tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<4x!quant.uniform>, value = dense<0> : tensor<4xi32>}> : () -> tensor<4x!quant.uniform> // CHECK: %[[PAD:.+]] = "tfl.pad"(%[[ARG_0]], %[[CONST_0]]) : (tensor<1x5x5x2x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x7x7x2x!quant.uniform> -// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x7x7x2x!quant.uniform>, tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x7x7x2x!quant.uniform>, tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<4x!quant.uniform>) 
-> tensor<1x4x4x4x!quant.uniform> // CHECK: return %[[CONV_2D]] func.func @conv_same_padding_srq(%arg0: tensor<1x32x32x3x!quant.uniform>) -> (tensor<1x32x32x2x!quant.uniform>) { @@ -661,9 +661,9 @@ func.func @conv_same_padding_srq(%arg0: tensor<1x32x32x3x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>} : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>}> : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> // CHECK: return %[[CONV_2D]] : tensor<1x32x32x2x!quant.uniform> // ----- @@ -676,9 +676,9 @@ func.func @conv_same_padding_srq_non_unit_strides(%arg0: tensor<1x32x32x3x!quant } // CHECK-LABEL: func.func @conv_same_padding_srq_non_unit_strides // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x32x32x3x!quant.uniform>) -> tensor<1x16x16x2x!quant.uniform> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>} : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x16x16x2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>}> : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], 
%[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x16x16x2x!quant.uniform> // CHECK: return %[[CONV_2D]] : tensor<1x16x16x2x!quant.uniform> // ----- @@ -692,11 +692,11 @@ func.func @conv_srq_transpose_conv(%arg0: tensor<1x5x5x2x!quant.uniform>) -> tensor<1x14x14x4x!quant.uniform> // CHECK-DAG: %[[CONST_0:.+]] = arith.constant dense<[1, 14, 14, 4]> : tensor<4xi32> -// CHECK-DAG: %[[CONST_1:.*]] = "tfl.pseudo_const"() {value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x2x2x2xi8>} : () -> tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x!quant.uniform>, value = dense<0> : tensor<4xi32>} : () -> tensor<4x!quant.uniform> +// CHECK-DAG: %[[CONST_1:.*]] = "tfl.pseudo_const"() <{value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x2x2x2xi8>}> : () -> tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<4x!quant.uniform>, value = dense<0> : tensor<4xi32>}> : () -> tensor<4x!quant.uniform> // CHECK: %[[PAD:.+]] = "tfl.pad"(%[[ARG_0]], %[[CONST_1]]) : (tensor<1x5x5x2x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x7x7x2x!quant.uniform> -// CHECK: %[[TRANSPOSE_CONV_2D:.+]] = "tfl.transpose_conv"(%[[CONST_0]], %[[QCONST_0]], %[[PAD]], %[[QCONST_1]]) {fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 4 : i32} : (tensor<4xi32>, tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<1x7x7x2x!quant.uniform>, tensor<4x!quant.uniform>) -> tensor<1x14x14x4x!quant.uniform> +// CHECK: %[[TRANSPOSE_CONV_2D:.+]] = "tfl.transpose_conv"(%[[CONST_0]], %[[QCONST_0]], %[[PAD]], %[[QCONST_1]]) <{fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 4 : i32}> : (tensor<4xi32>, tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<1x7x7x2x!quant.uniform>, tensor<4x!quant.uniform>) -> tensor<1x14x14x4x!quant.uniform> // CHECK: return %[[TRANSPOSE_CONV_2D]] // ----- @@ -721,11 +721,11 @@ func.func @conv_with_bias_and_relu_srq(%arg0: tensor<1x5x5x2x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> -// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() {value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x4x4x2xi8>} : () -> tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = 
"tfl.pseudo_qconst"() {qtype = tensor<4x!quant.uniform>, value = dense<5> : tensor<1x1x1x4xi32>} : () -> tensor<4x!quant.uniform> +// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() <{value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x4x4x2xi8>}> : () -> tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<4x!quant.uniform>, value = dense<5> : tensor<1x1x1x4xi32>}> : () -> tensor<4x!quant.uniform> // CHECK: %[[PAD:.+]] = "tfl.pad"(%[[ARG_0]], %[[CONST_0]]) : (tensor<1x5x5x2x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x7x7x2x!quant.uniform> -// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x7x7x2x!quant.uniform>, tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x7x7x2x!quant.uniform>, tensor<4x4x4x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> // CHECK: return %[[CONV_2D]] func.func @conv_with_bias_same_padding_srq(%arg0: tensor<1x32x32x3x!quant.uniform>) -> (tensor<1x32x32x2x!quant.uniform>) { @@ -739,9 +739,9 @@ func.func @conv_with_bias_same_padding_srq(%arg0: tensor<1x32x32x3x!quant.unifor } // CHECK-LABEL: func.func @conv_with_bias_same_padding_srq // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x32x32x3x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>} : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<5> : tensor<1x1x1x2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>}> : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x!quant.uniform>, value = dense<5> : tensor<1x1x1x2xi32>}> : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, 
fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> // CHECK: return %[[CONV_2D]] func.func @conv_with_bias_same_padding_srq_depthwise(%arg0: tensor<1x4x5x3x!quant.uniform>) -> (tensor<1x5x6x3x!quant.uniform>) { @@ -755,11 +755,11 @@ func.func @conv_with_bias_same_padding_srq_depthwise(%arg0: tensor<1x4x5x3x!quan } // CHECK-LABEL: func.func @conv_with_bias_same_padding_srq_depthwise // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x4x5x3x!quant.uniform>) -> tensor<1x5x6x3x!quant.uniform> -// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() {value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> -// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x2x3x!quant.uniform:f32:3, {3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<1x2x2x3xi8>} : () -> tensor<1x2x2x3x!quant.uniform:f32:3, {3.000000e+00,3.000000e+00,3.000000e+00}>> -// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform>, value = dense<5> : tensor<1x1x1x3xi32>} : () -> tensor<3x!quant.uniform> +// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() <{value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>}> : () -> tensor<4x2xi32> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x2x3x!quant.uniform:f32:3, {3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<1x2x2x3xi8>}> : () -> tensor<1x2x2x3x!quant.uniform:f32:3, {3.000000e+00,3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<3x!quant.uniform>, value = dense<5> : tensor<1x1x1x3xi32>}> : () -> tensor<3x!quant.uniform> // CHECK: %[[PAD:.+]] = "tfl.pad"(%[[ARG_0]], %[[CONST_0]]) : (tensor<1x4x5x3x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x6x7x3x!quant.uniform> -// CHECK: %[[DEPTHWISE_CONV_2D:.+]] = "tfl.depthwise_conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) {depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x6x7x3x!quant.uniform>, tensor<1x2x2x3x!quant.uniform:f32:3, {3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<3x!quant.uniform>) -> tensor<1x5x6x3x!quant.uniform> +// CHECK: %[[DEPTHWISE_CONV_2D:.+]] = "tfl.depthwise_conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) <{depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x6x7x3x!quant.uniform>, tensor<1x2x2x3x!quant.uniform:f32:3, {3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<3x!quant.uniform>) -> tensor<1x5x6x3x!quant.uniform> // CHECK: return %[[DEPTHWISE_CONV_2D]] // ----- @@ -872,7 +872,7 @@ func.func @concatenate( // CHECK-LABEL: concatenate // CHECK-SAME: %[[ARG0:.+]]: tensor<3x2x!quant.uniform>, %[[ARG1:.+]]: tensor<1x2x!quant.uniform> // CHECK-NOT: stablehlo.concatenate -// CHECK: %[[CONCAT:.+]] = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<3x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<4x2x!quant.uniform> +// CHECK: %[[CONCAT:.+]] = "tfl.concatenation"(%arg0, %arg1) <{axis = 0 : i32, fused_activation_function = "NONE"}> : 
(tensor<3x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<4x2x!quant.uniform> // CHECK: return %[[CONCAT]] // ----- @@ -998,7 +998,7 @@ func.func @strided_slice( // CHECK{LITERAL}: dense<[3, 4]> : tensor<2xi32> // CHECK: %[[STRIDE:.+]] = arith.constant // CHECK{LITERAL}: dense<[2, 3]> : tensor<2xi32> -// CHECK: %[[SLICE:.+]] = "tfl.strided_slice"(%[[ARG0]], %[[START]], %[[SIZE]], %[[STRIDE]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<3x6x!quant.uniform>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x2x!quant.uniform> +// CHECK: %[[SLICE:.+]] = "tfl.strided_slice"(%[[ARG0]], %[[START]], %[[SIZE]], %[[STRIDE]]) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<3x6x!quant.uniform>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x2x!quant.uniform> // CHECK: return %[[SLICE]] // ----- @@ -1456,7 +1456,7 @@ func.func @dynamic_slice( // CHECK: %[[MIN1:.+]] = "tfl.minimum"(%[[BITCAST1]], %[[MAX1]]) : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64> // CHECK: %[[BITCAST2:.+]] = "tfl.bitcast"(%[[ARG2]]) : (tensor) -> tensor<1xi64> // CHECK: %[[MIN2:.+]] = "tfl.minimum"(%[[BITCAST2]], %[[MAX2]]) : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64> -// CHECK: %[[CONCAT:.+]] = "tfl.concatenation"(%[[MIN1]], %[[MIN2]]) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xi64>, tensor<1xi64>) -> tensor<2xi64> +// CHECK: %[[CONCAT:.+]] = "tfl.concatenation"(%[[MIN1]], %[[MIN2]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1xi64>, tensor<1xi64>) -> tensor<2xi64> // CHECK: %[[MAX:.+]] = "tfl.maximum"(%[[CONCAT]], %[[ZERO]]) : (tensor<2xi64>, tensor<1xi64>) -> tensor<2xi64> // CHECK: %[[SLICE:.+]] = "tfl.slice"(%[[ARG0]], %[[MAX]], %[[SLICE_SIZE]]) // CHECK-SAME: (tensor<4x4x!quant.uniform>, tensor<2xi64>, tensor<2xi64>) -> tensor<2x1x!quant.uniform> @@ -1490,7 +1490,7 @@ func.func @add(%arg0: tensor<1x3x!quant.uniform>, %arg1: } // CHECK-LABEL: func @add -// CHECK: %[[ADD:.+]] = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x3x!quant.uniform>, tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: %[[ADD:.+]] = tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x3x!quant.uniform>, tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> // CHECK: return %[[ADD]] // ----- @@ -1517,7 +1517,7 @@ func.func @quantized_constant() -> tensor<1x2x4x5x!quant.uniform> } -// CHECK: %[[QCONST:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} +// CHECK: %[[QCONST:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>}> // CHECK-SAME: () -> tensor<1x2x4x5x!quant.uniform> // CHECK: return %[[QCONST]] @@ -1556,27 +1556,67 @@ func.func @dot_general_hybrid(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x5xf32> return %1 : tensor<1x2x3x5xf32> } -// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} +// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>}> // CHECK: %[[DQ:.+]] = "tfl.dequantize"(%[[WEIGHT]]) : (tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x4x5xf32> // CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG0]], %[[DQ]], batching_dims = [0, 1] x [0, 
1], contracting_dims = [3] x [2], precision = [DEFAULT, DEFAULT] : (tensor<1x2x3x4xf32>, tensor<1x2x4x5xf32>) -> tensor<1x2x3x5xf32>
// CHECK: return %[[DOT]]

// -----

-// Tests that a hybrid quantized convolution is splitted into dequantize and
-// float convolution.
+// Tests that a hybrid per-channel quantized convolution for tfl.conv_2d is
+// split into dequantize and float stablehlo.convolution.

-// CHECK-LABEL: func @convolution_hybrid
+// CHECK-LABEL: func @convolution_hybrid_per_channel
 // CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x3x4xf32>
-func.func @convolution_hybrid(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x2xf32> {
+func.func @convolution_hybrid_per_channel(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x2xf32> {
   %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform>
   %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2xf32>
   return %1 : tensor<1x3x3x2xf32>
 }

-// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x3x4x2x!quant.uniform>, value = dense<3> : tensor<3x3x4x2xi8>}
-// CHECK: %[[DQ:.+]] = "tfl.dequantize"(%[[WEIGHT]]) : (tensor<3x3x4x2x!quant.uniform>) -> tensor<3x3x4x2xf32>
+// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>}>
+// CHECK: %[[DQ:.+]] = "tfl.dequantize"(%[[WEIGHT]]) : (tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>) -> tensor<2x3x3x4xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG0]], %[[DQ]])
+// CHECK{LITERAL}: dim_numbers = [b, 0, 1, f]x[o, 0, 1, i]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64}
+// CHECK-SAME: (tensor<1x3x3x4xf32>, tensor<2x3x3x4xf32>) -> tensor<1x3x3x2xf32>
+// CHECK: return %[[CONV]]
+
+// -----
+
+// Tests that a hybrid per-tensor quantized convolution for tfl.conv_2d is
+// split into dequantize and float stablehlo.convolution.
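+// ("Hybrid" in these tests means weight-only quantization: the i8 weights
+// keep their quantization parameters on the tfl.pseudo_qconst, tfl.dequantize
+// restores f32 values at runtime, and the convolution itself runs in float.)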
+
+// CHECK-LABEL: func @convolution_hybrid_per_tensor
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x3x4xf32>
+func.func @convolution_hybrid_per_tensor(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x2xf32> {
+  %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform>
+  %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2xf32>
+  return %1 : tensor<1x3x3x2xf32>
+}
+
+// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x3x3x4x!quant.uniform>, value = dense<3> : tensor<2x3x3x4xi8>}>
+// CHECK: %[[DQ:.+]] = "tfl.dequantize"(%[[WEIGHT]]) : (tensor<2x3x3x4x!quant.uniform>) -> tensor<2x3x3x4xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG0]], %[[DQ]])
+// CHECK{LITERAL}: dim_numbers = [b, 0, 1, f]x[o, 0, 1, i]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64}
+// CHECK-SAME: (tensor<1x3x3x4xf32>, tensor<2x3x3x4xf32>) -> tensor<1x3x3x2xf32>
+// CHECK: return %[[CONV]]
+
+// -----
+
+// Tests that a hybrid per-channel quantized convolution for tfl.depthwise_conv
+// is split into dequantize and float stablehlo.convolution.
+
+// CHECK-LABEL: func @depthwise_convolution_hybrid_per_channel
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x3x4xf32>
+func.func @depthwise_convolution_hybrid_per_channel(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> {
+  %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x1x4xi8>} : () -> tensor<3x3x1x4x!quant.uniform>
+  %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 4 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x1x4x!quant.uniform>) -> tensor<1x3x3x4xf32>
+  return %1 : tensor<1x3x3x4xf32>
+}
+
+// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x3x3x4x!quant.uniform:f32:3, {2.000000e+02,3.000000e+03,2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<1x3x3x4xi8>}>
+// CHECK: %[[DQ:.+]] = "tfl.dequantize"(%[[WEIGHT]]) : (tensor<1x3x3x4x!quant.uniform:f32:3, {2.000000e+02,3.000000e+03,2.000000e+02,3.000000e+03}>>) -> tensor<1x3x3x4xf32>
 // CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG0]], %[[DQ]])
-// CHECK{LITERAL}: dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64}
-// CHECK-SAME: (tensor<1x3x3x4xf32>, tensor<3x3x4x2xf32>) -> tensor<1x3x3x2xf32>
+// CHECK{LITERAL}: dim_numbers = [b, 0, 1, f]x[i, 0, 1, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 4 : i64}
+// CHECK-SAME: (tensor<1x3x3x4xf32>, tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32>
 // CHECK: return %[[CONV]]
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc
index 15481b9a0a1ad2..52f2c4be02a3aa 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc
@@ -73,7 +73,7 @@ bool IsI8ToF32Cast(stablehlo::ConvertOp convert_op) {
   const bool is_i8_operand =
convert_op.getOperand().getType().getElementType().isInteger(/*width=*/8); const bool is_f32_result = - convert_op.getResult().getType().getElementType().isa(); + mlir::isa(convert_op.getResult().getType().getElementType()); return is_i8_operand && is_f32_result; } @@ -92,7 +92,7 @@ bool IsI32ToF32Cast(stablehlo::ConvertOp convert_op) { convert_op.getOperand().getType().getElementType().isInteger( /*width=*/32); const bool is_f32_result = - convert_op.getResult().getType().getElementType().isa(); + mlir::isa(convert_op.getResult().getType().getElementType()); return is_i32_operand && is_f32_result; } @@ -104,7 +104,8 @@ LogicalResult MatchZeroPointsOperand(Value zero_points) { return failure(); } - auto zero_points_type = zero_points.getType().dyn_cast_or_null(); + auto zero_points_type = + mlir::dyn_cast_or_null(zero_points.getType()); if (!zero_points_type) { LLVM_DEBUG(llvm::dbgs() << "Zero point value should be a tensor type. Got: " << zero_points_type << ".\n"); @@ -112,7 +113,7 @@ LogicalResult MatchZeroPointsOperand(Value zero_points) { } if (Type zero_points_element_type = zero_points_type.getElementType(); - !zero_points_element_type.isa()) { + !mlir::isa(zero_points_element_type)) { LLVM_DEBUG(llvm::dbgs() << "Zero point should be an integer type. Got: " << zero_points_element_type << ".\n"); return failure(); @@ -146,7 +147,7 @@ LogicalResult MatchInverseScalesOperand(Value inverse_scales) { } auto inverse_scales_type = - inverse_scales.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(inverse_scales.getType()); if (!inverse_scales_type) { LLVM_DEBUG(llvm::dbgs() << "Inverse scales should be a tensor type. Got: " << inverse_scales_type << ".\n"); @@ -154,7 +155,7 @@ LogicalResult MatchInverseScalesOperand(Value inverse_scales) { } if (Type inverse_scales_element_type = inverse_scales_type.getElementType(); - !inverse_scales_element_type.isa()) { + !mlir::isa(inverse_scales_element_type)) { LLVM_DEBUG(llvm::dbgs() << "Inverse scales element should be a float type. Got: " << inverse_scales_element_type << ".\n"); @@ -207,7 +208,7 @@ class UniformQuantizeFunctionCallPattern { } auto input_value_type = - input_value.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(input_value.getType()); if (!input_value_type) { LLVM_DEBUG(llvm::dbgs() << "Failed to match @uniform_quantize function call pattern. " @@ -216,7 +217,7 @@ class UniformQuantizeFunctionCallPattern { } if (Type input_element_type = input_value_type.getElementType(); - !input_element_type.isa()) { + !mlir::isa(input_element_type)) { LLVM_DEBUG(llvm::dbgs() << "Failed to match @uniform_quantize function call pattern. " "Input value's element type must be a float. Got: " @@ -299,7 +300,7 @@ class UniformDequantizeFunctionCallPattern { } auto input_value_type = - input_value.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(input_value.getType()); if (!input_value_type) { LLVM_DEBUG(llvm::dbgs() << "Failed to match @uniform_dequantize call pattern. Input " @@ -309,7 +310,7 @@ class UniformDequantizeFunctionCallPattern { } if (Type input_element_type = input_value_type.getElementType(); - !input_element_type.isa()) { + !mlir::isa(input_element_type)) { LLVM_DEBUG(llvm::dbgs() << "Failed to match @uniform_dequantize call pattern. Input " "value's element type must be integer. Got: " @@ -433,8 +434,9 @@ class ComposeUniformQuantizedConvolutionOp LogicalResult match(stablehlo::ConvolutionOp op) const final { // Verify operands' types. 
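// The operands are still float tensors at this point: the quantized values
// are disguised as f32 via the upcast trick, so a non-float operand means the
// subgraph cannot be the uniform-quantized convolution being composed.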
for (Type operand_type : op.getOperandTypes()) { - if (Type element_type = operand_type.cast().getElementType(); - !element_type.isa()) { + if (Type element_type = + mlir::cast(operand_type).getElementType(); + !mlir::isa(element_type)) { LLVM_DEBUG(llvm::dbgs() << "Failed to match. The operand type must be a float. Got: " << element_type << ".\n"); @@ -477,8 +479,9 @@ class ComposeUniformQuantizedConvolutionOp // Match the subgraph that receives the convolution output. Value conv_output_value = op.getResult(); if (auto output_element_type = - conv_output_value.getType().cast().getElementType(); - !output_element_type.isa()) { + mlir::cast(conv_output_value.getType()) + .getElementType(); + !mlir::isa(output_element_type)) { LLVM_DEBUG( llvm::dbgs() << "Failed to match. Output type is expected to be a float. Got: " @@ -530,14 +533,12 @@ class ComposeUniformQuantizedConvolutionOp return failure(); } - if (!(other_zp_i8_to_f32_convert_op.getResult() - .getType() - .getElementType() - .isa() && - other_zp_i8_to_f32_convert_op.getOperand() - .getType() - .getElementType() - .isa())) { + if (!(mlir::isa(other_zp_i8_to_f32_convert_op.getResult() + .getType() + .getElementType()) && + mlir::isa(other_zp_i8_to_f32_convert_op.getOperand() + .getType() + .getElementType()))) { LLVM_DEBUG( llvm::dbgs() << "Failed to match. The ConvertOp is not an i8->f32 type cast.\n"); @@ -671,8 +672,8 @@ class ComposeUniformQuantizedConvolutionOp rewriter.create( uniform_quantize_call_op.getLoc(), /*result=*/ - input_value.getType().cast().clone( - input_quantized_element_type), + mlir::cast(input_value.getType()) + .clone(input_quantized_element_type), /*operand=*/input_value); rewriter.replaceAllUsesWith(input_i8_to_f32_convert_op.getResult(), @@ -689,20 +690,21 @@ class ComposeUniformQuantizedConvolutionOp // This is i8 values disguised as f32 (due to the upcast trick). Simply // cast them to i8. ElementsAttr filter_value = filter_constant_op.getValue(); - filter_i8_value_attr = filter_value.cast().mapValues( - rewriter.getI8Type(), [](const APFloat& val) -> APInt { - APSInt convertedInt(/*BitWidth=*/8, /*isUnsigned=*/false); - bool ignored; - val.convertToInteger(convertedInt, APFloat::rmTowardZero, &ignored); - return convertedInt; - }); + filter_i8_value_attr = + mlir::cast(filter_value) + .mapValues(rewriter.getI8Type(), [](const APFloat& val) -> APInt { + APSInt convertedInt(/*BitWidth=*/8, /*isUnsigned=*/false); + bool ignored; + val.convertToInteger(convertedInt, APFloat::rmTowardZero, + &ignored); + return convertedInt; + }); } else if (isa(filter_op) && isa( filter_op->getOperand(0).getDefiningOp())) { - filter_i8_value_attr = + filter_i8_value_attr = mlir::cast( cast(filter_op->getOperand(0).getDefiningOp()) - .getValue() - .cast(); + .getValue()); } // Create Uniform Quantized constant for the filter. @@ -719,9 +721,9 @@ class ComposeUniformQuantizedConvolutionOp scale_combined_broadcast_in_dim_op.getOperand().getDefiningOp()); SmallVector filter_scale_values; - for (const auto combined_scale_value : combined_scale_constant_op.getValue() - .cast() - .getValues()) { + for (const auto combined_scale_value : + mlir::cast(combined_scale_constant_op.getValue()) + .getValues()) { // UniformQuantizedPerAxisType requires scales to have double dtype. 
const double filter_scale_value = static_cast( combined_scale_value * input_inverse_scales_value); @@ -780,7 +782,8 @@ class ComposeUniformQuantizedConvolutionOp Value conv_output_value = op.getResult(); auto output_uniform_quantized_tensor_type = RankedTensorType::getChecked( rewriter.getUnknownLoc(), - /*shape=*/conv_output_value.getType().cast().getShape(), + /*shape=*/ + mlir::cast(conv_output_value.getType()).getShape(), output_uniform_quantized_type); SmallVector new_conv_output_types = { @@ -1017,8 +1020,8 @@ class ComposeUniformQuantizedDotGeneralOp rewriter.create( input_i8_to_f32_convert_op.getLoc(), /*result=*/ - input_value.getType().cast().clone( - input_uniform_quantized_type), + mlir::cast(input_value.getType()) + .clone(input_uniform_quantized_type), /*operand=*/input_value); rewriter.replaceAllUsesWith(input_i8_to_f32_convert_op.getResult(), @@ -1029,13 +1032,13 @@ class ComposeUniformQuantizedDotGeneralOp stablehlo::ConstantOp filter_constant_op = GetFilterConstantOp(filter_value); auto filter_value_attr = - filter_constant_op.getValue().cast(); + mlir::cast(filter_constant_op.getValue()); if (filter_value_attr.getElementType().isF32()) { // This is i8 values disguised as f32 (due to the upcast trick). Simply // cast them to i8. filter_value_attr = - filter_value_attr.cast().mapValues( - rewriter.getI8Type(), [](const APFloat& val) -> APInt { + mlir::cast(filter_value_attr) + .mapValues(rewriter.getI8Type(), [](const APFloat& val) -> APInt { APSInt converted_int(/*BitWidth=*/8, /*isUnsigned=*/false); bool ignored; val.convertToInteger(converted_int, APFloat::rmTowardZero, @@ -1072,9 +1075,9 @@ class ComposeUniformQuantizedDotGeneralOp auto merged_scale_constant_op = cast(multiply_op_second_operand.getDefiningOp()); SmallVector filter_scale_values; - for (const auto merged_scale : merged_scale_constant_op.getValue() - .cast() - .getValues()) { + for (const auto merged_scale : + mlir::cast(merged_scale_constant_op.getValue()) + .getValues()) { // (s1 * s2) * (1 / s1) = s2 // UniformQuantizedPerAxisType requires scales to have double dtype. 
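// Multiplying each merged scale (s1 * s2) by the input's inverse scale (1 / s1)
// recovers the filter-only scale s2, which is what gets recorded per axis.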
filter_scale_values.push_back( @@ -1086,7 +1089,7 @@ class ComposeUniformQuantizedDotGeneralOp const int quantization_dimension = GetFilterQuantizationDimension( op.getDotDimensionNumbers(), - filter_value_attr.getType().cast().getRank()); + mlir::cast(filter_value_attr.getType()).getRank()); const UniformQuantizedPerAxisType filter_uniform_quantized_type = CreateI8F32UniformQuantizedPerAxisType( filter_constant_op.getLoc(), *rewriter.getContext(), @@ -1097,8 +1100,8 @@ class ComposeUniformQuantizedDotGeneralOp auto quantized_filter_constant_op = rewriter.create( filter_constant_op.getLoc(), /*output=*/ - filter_constant_op.getResult().getType().cast().clone( - filter_uniform_quantized_type), + mlir::cast(filter_constant_op.getResult().getType()) + .clone(filter_uniform_quantized_type), /*value=*/filter_value_attr); rewriter.replaceAllUsesWith(filter_value, @@ -1137,8 +1140,8 @@ class ComposeUniformQuantizedDotGeneralOp auto new_dot_general_op = rewriter.create( op.getLoc(), /*resultType0=*/ - op.getResult().getType().cast().clone( - output_uniform_quantized_type), + mlir::cast(op.getResult().getType()) + .clone(output_uniform_quantized_type), /*lhs=*/op.getLhs(), /*rhs=*/op.getRhs(), /*dot_dimension_numbers=*/op.getDotDimensionNumbers(), /*precision_config=*/op.getPrecisionConfigAttr()); @@ -1395,8 +1398,8 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations rewriter.create( input1_uniform_quantize_call_op.getLoc(), /*result=*/ - input1_value.getType().cast().clone( - input1_uniform_quantized_type), + mlir::cast(input1_value.getType()) + .clone(input1_uniform_quantized_type), /*operand=*/input1_value); rewriter.replaceAllUsesWith(input1_zero_point_subtract_op.getResult(), @@ -1434,8 +1437,8 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations rewriter.create( input2_uniform_quantize_call_op.getLoc(), /*result=*/ - input2_value.getType().cast().clone( - input2_uniform_quantized_type), + mlir::cast(input2_value.getType()) + .clone(input2_uniform_quantized_type), /*operand=*/input2_value); rewriter.replaceAllUsesWith(input2_zero_point_subtract_op.getResult(), @@ -1482,8 +1485,8 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations auto new_dot_general_op = rewriter.create( op.getLoc(), /*resultType0=*/ - op.getResult().getType().cast().clone( - output_uniform_quantized_type), + mlir::cast(op.getResult().getType()) + .clone(output_uniform_quantized_type), /*lhs=*/op.getLhs(), /*rhs=*/op.getRhs(), /*dot_dimension_numbers=*/op.getDotDimensionNumbers(), /*precision_config=*/op.getPrecisionConfigAttr()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.cc index 801c8775682cbd..8c28f2e5e5df4b 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_avg_pool.cc @@ -55,7 +55,7 @@ DenseIntElementsAttr GetPaddingArrayAttr(Builder& builder, Operation* old_op) { } ShapedType GetPaddedType(Operation* old_op) { - auto input_type = old_op->getOperand(0).getType().cast(); + auto input_type = mlir::cast(old_op->getOperand(0).getType()); auto input_shape = input_type.getShape(); // NCHW int64_t batch_size = input_shape[0]; int64_t channel_size = input_shape[1]; @@ -124,7 +124,7 @@ StringAttr GetPaddingStringAttr(Builder& builder, Operation* old_op) { auto composite_attrs = composite_op.getCompositeAttributes(); auto operand_shape = - 
composite_op.getOperand(0).getType().cast().getShape();
+      mlir::cast(composite_op.getOperand(0).getType()).getShape();
   // NC(H)(W)
   std::vector spatial_dim_sizes = {
       static_cast(operand_shape[2]),
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td
index 829cf2fbaf16a4..5b9324c2a1782b 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td
@@ -49,3 +49,24 @@ def LegalizeTorchUpsampleBlinear2dComposite: Pat<
     (Arith_ConstantOp ConstantAttr,"{0, 3, 1, 2}">)),
   [(IsSupportedNchwUpsampleBlinear $input, $old_val, $attrs)]>;
+
+// TODO(b/333961789): Add support for NCHW layout for PyTorch resize, plus JAX
+// supports NCHW inputs as well, so we need to add a reliable way of checking
+// the layout.
+// Pattern to lower a stablehlo.composite with `jax.image.resize` in `nearest`
+// mode to a tflite.resize_nearest_neighbor op.
+def LegalizeJaxResizeNearestNeighbor2dComposite: Pat<
+  (MHLO_CompositeOp:$old_val
+    (variadic $input),
+    ConstantStrAttr, $attrs, $_, $_),
+  (TFL_ResizeNearestNeighborOp
+    $input,
+    (Arith_ConstantOp:$output_size (GetI32DenseAttr (GetAsVectorAttr<"output_size"> $attrs))),
+    ConstBoolAttrFalse,
+    ConstBoolAttrTrue)>;
+
+
+def LegalizeCompositeGELU : Pat<(MHLO_CompositeOp:$composite
+                    (variadic $inputs),
+                    ConstantStrAttr, $_, $_, $_),
+                  (TFL_GeluOp $inputs, ConstBoolAttrFalse)>;
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.cc
index 403bf9968a9acd..2809c81458918c 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 namespace mlir {
 namespace odml {
@@ -65,8 +66,8 @@ bool GetI32VectorFromDenseI64CompositeAttr(
 bool IsSupportedNchwUpsampleBlinear(
     Value input, Value output, const DenseIntElementsAttr& output_size_attr) {
-  auto input_shape = input.getType().cast().getShape();
-  auto output_shape = output.getType().cast().getShape();
+  auto input_shape = mlir::cast(input.getType()).getShape();
+  auto output_shape = mlir::cast(output.getType()).getShape();
   // Only support 4D tensor.
if (input_shape.size() != 4 || output_shape.size() != 4) { @@ -89,7 +90,7 @@ bool IsSupportedNchwUpsampleBlinear( ShapedType GetNhwcReturnTypeFromNchw(Operation* old_op) { auto composite_result_shape = - old_op->getResults().front().getType().cast().getShape(); + mlir::cast(old_op->getResults().front().getType()).getShape(); std::array output_shape; // NHWC <- NCHW output_shape[0] = composite_result_shape[0]; @@ -97,7 +98,7 @@ ShapedType GetNhwcReturnTypeFromNchw(Operation* old_op) { output_shape[2] = composite_result_shape[3]; output_shape[3] = composite_result_shape[1]; - auto input_type = old_op->getOperand(0).getType().cast(); + auto input_type = mlir::cast(old_op->getOperand(0).getType()); return RankedTensorType::get(output_shape, input_type.getElementType()); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h index 0691dc74997212..79d0910bce18a4 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.h @@ -38,10 +38,10 @@ template bool EnsureAttribute(const DictionaryAttr& composite_attributes, const std::string& attr_name, AttrType* out_attr) { Attribute attr = composite_attributes.get(attr_name); - if (!attr.isa_and_nonnull()) { + if (!mlir::isa_and_nonnull(attr)) { return false; } - if (AttrType content = attr.dyn_cast()) { + if (AttrType content = mlir::dyn_cast(attr)) { *out_attr = content; return true; } else { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td index d39a8efb8b13b3..30d6f4247fba52 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_utils.td @@ -22,7 +22,6 @@ include "mlir/IR/PatternBase.td" def GetNhwcReturnTypeFromNchw: NativeCodeCall< "GetNhwcReturnTypeFromNchw((*$0.begin()).getDefiningOp())">; - // When given a DenseIntElementsAttr containing I64 elements, this extracts // one I32IntegerAttr from the given index. class GetI32At: NativeCodeCall< diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc index 847738e5cc7cbe..c2b31aeb540720 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc @@ -32,6 +32,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" @@ -117,7 +118,7 @@ static Attribute BinaryFolder(Op *op) { auto rhs = dyn_cast_or_null(rhs_op.getValue()); if (!lhs || !rhs) return {}; - ShapedType type = op->getType().template cast(); + ShapedType type = mlir::cast(op->getType()); if (!type.hasStaticShape()) { return {}; } @@ -125,15 +126,15 @@ static Attribute BinaryFolder(Op *op) { Type etype = type.getElementType(); // Evaluate for element types. 
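// Each BinaryFolder instantiation targets one element category; when the
// element type does not match it, return an empty attribute so nothing folds.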
- if (!etype.isa()) { + if (!mlir::isa(etype)) { return {}; } // Special case for folding splats no matter how large. // Only covers the case of both attrs being splats; operation-specific cases // like adding a zero or multiplying by one are handled elsewhere. - SplatElementsAttr splatLhs = lhs.template dyn_cast(); - SplatElementsAttr splatRhs = rhs.template dyn_cast(); + SplatElementsAttr splatLhs = mlir::dyn_cast(lhs); + SplatElementsAttr splatRhs = mlir::dyn_cast(rhs); if (splatLhs && splatRhs) { auto signedLhs = addSign(splatLhs.getSplatValue(), etype); auto signedRhs = addSign(splatRhs.getSplatValue(), etype); @@ -195,10 +196,10 @@ class FoldBroadcastInDimBeforeBinaryElementwiseOp auto bcast_dims = bcast_op.getBroadcastDimensions(); auto elem_type = const_val.getElementType(); Attribute result; - if (elem_type.template isa()) { + if (mlir::isa(elem_type)) { result = ConstFoldBroadcastInDim(result_type, const_val, bcast_dims); - } else if (elem_type.template isa()) { + } else if (mlir::isa(elem_type)) { result = ConstFoldBroadcastInDim(result_type, const_val, bcast_dims); } else { @@ -217,14 +218,14 @@ using FoldBroadcastInDimBeforeMulOp = // Constant folds mhlo.mul, this folder doesn't have an upper limit on how many // elements can be folded. LogicalResult ConstantFoldMul(mhlo::MulOp op, PatternRewriter &rewriter) { - ShapedType type = op.getType().dyn_cast(); + ShapedType type = mlir::dyn_cast(op.getType()); Type etype = type.getElementType(); Attribute result = {}; - if (etype.isa()) { + if (mlir::isa(etype)) { result = BinaryFolder>( &op); - } else if (etype.isa()) { + } else if (mlir::isa(etype)) { result = BinaryFolder>( &op); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/hlo_matchers.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/hlo_matchers.cc index 2d9308b05cb47b..9e7a5d424a2ecc 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/hlo_matchers.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/hlo_matchers.cc @@ -237,9 +237,9 @@ bool MatchReshapedIota(DenseIntElementsAttr dimensions, Value iota) { auto reshape_op = dyn_cast_or_null(iota.getDefiningOp()); if (!reshape_op) return false; auto operand_type = - reshape_op.getOperand().getType().dyn_cast(); + mlir::dyn_cast(reshape_op.getOperand().getType()); if (!operand_type || !operand_type.hasStaticShape()) return false; - auto reshape_type = reshape_op.getType().cast(); + auto reshape_type = mlir::cast(reshape_op.getType()); // Reshape can take a 1-D iota input and add extra dims of size one. if (operand_type.getRank() != 1) return false; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc index 3a483f44568ce2..96081a2b2b1bd8 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc @@ -162,10 +162,8 @@ class ConvertNdConvOp : public OpConversionPattern { } // tf Convolution doesn't support quantized type. 
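// A quantized filter element type therefore fails the match immediately and
// leaves the convolution untouched by this pattern.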
- if (conv_op.getRhs() - .getType() - .getElementType() - .isa()) { + if (mlir::isa( + conv_op.getRhs().getType().getElementType())) { return failure(); } @@ -193,11 +191,11 @@ class ConvertNdConvOp : public OpConversionPattern { const int kernel_input_feature_dimension = dnums.getKernelInputFeatureDimension(); const int input_channels = - conv_op.getLhs().getType().cast().getDimSize( - input_feature_dimension); + mlir::cast(conv_op.getLhs().getType()) + .getDimSize(input_feature_dimension); const int kernel_input_channels = - conv_op.getRhs().getType().cast().getDimSize( - kernel_input_feature_dimension); + mlir::cast(conv_op.getRhs().getType()) + .getDimSize(kernel_input_feature_dimension); int feature_group_count = conv_op.getFeatureGroupCount(); // check if group count is valid @@ -238,14 +236,14 @@ class ConvertNdConvOp : public OpConversionPattern { }; static bool IsSupportedConvOp(mhlo::ConvolutionOp conv_op) { - if (!conv_op.getRhs().getType().cast().hasStaticShape()) { + if (!mlir::cast(conv_op.getRhs().getType()).hasStaticShape()) { return false; } - if (!conv_op.getLhs().getType().cast().hasStaticShape() && - !conv_op.getType().cast().hasStaticShape()) { + if (!mlir::cast(conv_op.getLhs().getType()).hasStaticShape() && + !mlir::cast(conv_op.getType()).hasStaticShape()) { auto dnums = conv_op.getDimensionNumbers(); - auto lhs_type = conv_op.getLhs().getType().cast(); - auto out_type = conv_op.getType().cast(); + auto lhs_type = mlir::cast(conv_op.getLhs().getType()); + auto out_type = mlir::cast(conv_op.getType()); int64_t input_batch_dim = dnums.getInputBatchDimension(); int64_t out_batch_dim = dnums.getOutputBatchDimension(); for (size_t i = 0; i < lhs_type.getRank(); ++i) { @@ -263,10 +261,7 @@ class ConvertNdConvOp : public OpConversionPattern { if (!lhs_dilation.isSplat() || lhs_dilation.getSplatValue() != 1) return false; - if (conv_op.getWindowStrides() - .value() - .getType() - .cast() + if (mlir::cast(conv_op.getWindowStrides().value().getType()) .getRank() != 1) return false; @@ -290,10 +285,10 @@ class ConvertNdConvOp : public OpConversionPattern { int64_t pad_low_int64; int64_t pad_high_int64; tensorflow::Status status = tensorflow::GetWindowedOutputSizeVerbose( - conv_op.getLhs().getType().cast().getDimSize( - input_spatial_dim[i]), - conv_op.getRhs().getType().cast().getDimSize( - kernel_spatial_dim[i]), + mlir::cast(conv_op.getLhs().getType()) + .getDimSize(input_spatial_dim[i]), + mlir::cast(conv_op.getRhs().getType()) + .getDimSize(kernel_spatial_dim[i]), dilation[dim], strides[dim], tensorflow::Padding::SAME, &output_size, &pad_low_int64, &pad_high_int64); if (!status.ok()) return false; @@ -314,7 +309,7 @@ class ConvertNdConvOp : public OpConversionPattern { return value; } - auto input_type = value.getType().cast(); + auto input_type = mlir::cast(value.getType()); auto input_shape = input_type.getShape(); llvm::SmallVector start; @@ -380,7 +375,7 @@ class ConvertNdConvOp : public OpConversionPattern { // Convolution. This is needed because TF.Conv3DOp doesn't support EXPLICIT. 
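// The explicit padding amounts are materialized as a constant and applied by
// a separate pad op, after which the convolution itself can use VALID padding.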
if (padding == "EXPLICIT" && num_spatial_dims == 3) { auto lhs_type = - conv_op.getLhs().getType().template dyn_cast(); + mlir::dyn_cast(conv_op.getLhs().getType()); RankedTensorType padding_attr_type = mlir::RankedTensorType::get( {lhs_type.getRank(), 2}, rewriter.getIntegerType(64)); auto padding_const = rewriter.create( @@ -394,7 +389,7 @@ class ConvertNdConvOp : public OpConversionPattern { padding = "VALID"; } - auto conv_output_type = conv_op.getType().cast(); + auto conv_output_type = mlir::cast(conv_op.getType()); DenseIntElementsAttr permutation; const bool need_transpose_output = NeedsReformatTypeAndPermutation( dnums.getOutputBatchDimension(), dnums.getOutputFeatureDimension(), @@ -418,7 +413,7 @@ class ConvertNdConvOp : public OpConversionPattern { // Reshapes filter format to [filter_height, filter_width, in_channels, // channel_multiplier] from HLO's [filter_height, filter_width, 1, // in_channels * channel_multiplier] format. - auto filter_type = rhs.getType().cast(); + auto filter_type = mlir::cast(rhs.getType()); llvm::ArrayRef hlo_filter_shape = filter_type.getShape(); llvm::SmallVector tf_filter_shape(hlo_filter_shape.begin(), hlo_filter_shape.end()); @@ -491,13 +486,13 @@ class Convert1DConvOp : public OpConversionPattern { // Group convolution is not supported yet. const int64_t input_feature_dimension = dnums.getInputFeatureDimension(); const int64_t input_channels = - conv_op.getLhs().getType().cast().getDimSize( - input_feature_dimension); + mlir::cast(conv_op.getLhs().getType()) + .getDimSize(input_feature_dimension); const int kernel_input_feature_dimension = dnums.getKernelInputFeatureDimension(); const int kernel_input_channels = - conv_op.getRhs().getType().cast().getDimSize( - kernel_input_feature_dimension); + mlir::cast(conv_op.getRhs().getType()) + .getDimSize(kernel_input_feature_dimension); const int64_t feature_group_count = conv_op.getFeatureGroupCount(); if (feature_group_count != input_channels / kernel_input_channels || input_channels % kernel_input_channels != 0) @@ -508,7 +503,7 @@ class Convert1DConvOp : public OpConversionPattern { // // Reshape input image to add a new spatial dimension. - auto image_type = conv_op.getLhs().getType().cast(); + auto image_type = mlir::cast(conv_op.getLhs().getType()); SmallVector image_2d_shape(image_type.getShape().begin(), image_type.getShape().end()); image_2d_shape.push_back(1); @@ -530,7 +525,7 @@ class Convert1DConvOp : public OpConversionPattern { image_permutation_and_shape.permutation); // Reshape kernel to add a new spatial dimension. - auto kernel_type = conv_op.getRhs().getType().cast(); + auto kernel_type = mlir::cast(conv_op.getRhs().getType()); SmallVector kernel_2d_shape; for (int64_t dim : kernel_type.getShape()) { kernel_2d_shape.push_back(dim); @@ -623,7 +618,7 @@ class Convert1DConvOp : public OpConversionPattern { // // Determine the 2-D convolution output shape. 
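// As with the input image and kernel above, a trailing unit spatial dimension
// is appended so the 1-D result can be produced by a 2-D convolution.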
- auto output_type = conv_op->getResult(0).getType().cast(); + auto output_type = mlir::cast(conv_op->getResult(0).getType()); SmallVector output_2d_shape; for (int64_t dim : output_type.getShape()) { output_2d_shape.push_back(dim); @@ -648,7 +643,7 @@ class Convert1DConvOp : public OpConversionPattern { conv_op.getPrecisionConfigAttr()); OpResult conv2d_output = conv2d_op->getResult(0); - auto conv2d_output_type = conv2d_output.getType().cast(); + auto conv2d_output_type = mlir::cast(conv2d_output.getType()); // // Transpose and reshape the output @@ -676,9 +671,9 @@ using Convert3DConvOp = ConvertNdConvOp<3>; // lhs_dilation>1 and window_strides=1. LogicalResult IsSupportedNonTrivialConvOp(mhlo::ConvolutionOp conv_op, ConversionPatternRewriter& rewriter) { - if (!conv_op.getLhs().getType().cast().hasStaticShape() || - !conv_op.getRhs().getType().cast().hasStaticShape() || - !conv_op.getType().cast().hasStaticShape()) + if (!mlir::cast(conv_op.getLhs().getType()).hasStaticShape() || + !mlir::cast(conv_op.getRhs().getType()).hasStaticShape() || + !mlir::cast(conv_op.getType()).hasStaticShape()) return rewriter.notifyMatchFailure(conv_op, "requires static shape"); mhlo::ConvDimensionNumbersAttr dnums = conv_op.getDimensionNumbers(); @@ -687,10 +682,7 @@ LogicalResult IsSupportedNonTrivialConvOp(mhlo::ConvolutionOp conv_op, return rewriter.notifyMatchFailure(conv_op, "requires non-trivial lhs_dilation"); - if (conv_op.getWindowStrides() - .value() - .getType() - .cast() + if (mlir::cast(conv_op.getWindowStrides().value().getType()) .getRank() != 1) return rewriter.notifyMatchFailure( conv_op, "requires window_strides to equal to one"); @@ -746,19 +738,19 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp mhlo::ConvDimensionNumbersAttr dnums = conv_op.getDimensionNumbers(); const int input_feature_dimension = dnums.getInputFeatureDimension(); const int input_channels = - conv_op.getLhs().getType().cast().getDimSize( - input_feature_dimension); + mlir::cast(conv_op.getLhs().getType()) + .getDimSize(input_feature_dimension); int feature_group_count = conv_op.getFeatureGroupCount(); const int kernel_input_feature_dimension = dnums.getKernelInputFeatureDimension(); const int kernel_input_channels = - conv_op.getRhs().getType().cast().getDimSize( - kernel_input_feature_dimension); + mlir::cast(conv_op.getRhs().getType()) + .getDimSize(kernel_input_feature_dimension); const int kernel_output_feature_dimension = dnums.getKernelOutputFeatureDimension(); const int kernel_output_channels = - conv_op.getRhs().getType().cast().getDimSize( - kernel_output_feature_dimension); + mlir::cast(conv_op.getRhs().getType()) + .getDimSize(kernel_output_feature_dimension); // To support a depthwise convolution, we need- // 1. 
feature_group_count != 1 (except when input_channels==1) @@ -795,7 +787,7 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp auto create_slice = [&](mlir::Value tensor, int depth_idx, int channel_idx, bool is_kernel = false) -> mlir::Value { std::vector tensor_shape = - tensor.getType().cast().getShape().vec(); + mlir::cast(tensor.getType()).getShape().vec(); // Calculate offsets based on depth_idx, channel_idx and tensor_shape std::vector start_indices(tensor_shape.size(), 0); @@ -828,7 +820,8 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp // Calculate convolution output_type based on sliced_input and // sliced_kernel - auto output_type = conv_op->getResult(0).getType().cast(); + auto output_type = + mlir::cast(conv_op->getResult(0).getType()); std::vector new_output_shape = output_type.getShape().vec(); new_output_shape[dnums.getOutputFeatureDimension()] /= feature_group_count; @@ -884,8 +877,8 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp int feature_group_count = conv_op.getFeatureGroupCount(); const int input_feature_dimension = dnums.getInputFeatureDimension(); const int input_channels = - conv_op.getLhs().getType().cast().getDimSize( - input_feature_dimension); + mlir::cast(conv_op.getLhs().getType()) + .getDimSize(input_feature_dimension); // Check for Group Convolution parameters if (feature_group_count != 1 && feature_group_count != input_channels) { @@ -919,7 +912,7 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp auto padding_values = padding.getValues(); // Cast the dimension sizes to int. - auto lhs_type = conv_op.getLhs().getType().cast(); + auto lhs_type = mlir::cast(conv_op.getLhs().getType()); llvm::SmallVector input_sizes = { static_cast(lhs_type.getDimSize(input_spatial_dimensions[0])), static_cast(lhs_type.getDimSize(input_spatial_dimensions[1]))}; @@ -1101,7 +1094,8 @@ class ConvertNonTrivialConvOp transpose_order[dnums.getOutputSpatialDimensions().data()[i]] = i + 1; } auto output_shape = - conv_op.getResult().getType().cast().getShape(); + mlir::cast(conv_op.getResult().getType()) + .getShape(); SmallVector transposed_output_shape = { output_shape[dnums.getOutputBatchDimension()], output_shape[dnums.getOutputSpatialDimensions().data()[0]], @@ -1114,7 +1108,7 @@ class ConvertNonTrivialConvOp } auto output_type = RankedTensorType::get( transposed_output_shape, - conv_op.getRhs().getType().cast().getElementType()); + mlir::cast(conv_op.getRhs().getType()).getElementType()); auto output_sizes = rewriter.create( conv_op.getLoc(), DenseIntElementsAttr::get( @@ -1138,7 +1132,8 @@ class ConvertNonTrivialConvOp } else { SmallVector output_shape_i32; for (int64_t dim : - conv_op.getResult().getType().cast().getShape()) { + mlir::cast(conv_op.getResult().getType()) + .getShape()) { output_shape_i32.push_back(dim); } auto output_sizes = rewriter.create( @@ -1176,14 +1171,12 @@ class ConvertNonTrivialConvOp for (size_t i = 1; i <= num_spatial_dims; ++i) { int64_t stride = strides[i]; - int64_t input_size = - conv_op.getLhs().getType().cast().getDimSize( - input_spatial_dims[i - 1]); - int64_t kernel_size = - conv_op.getRhs().getType().cast().getDimSize( - kernel_spatial_dims[i - 1]); - int64_t output_size = conv_op.getType().cast().getDimSize( - output_spatial_dims[i - 1]); + int64_t input_size = mlir::cast(conv_op.getLhs().getType()) + .getDimSize(input_spatial_dims[i - 1]); + int64_t kernel_size = mlir::cast(conv_op.getRhs().getType()) + .getDimSize(kernel_spatial_dims[i - 1]); + int64_t output_size = 
mlir::cast(conv_op.getType()) + .getDimSize(output_spatial_dims[i - 1]); // stablehlo.convolution op needs explicit padding to be set to model any // Transposed-Convolution in JAX/PT. Checking to see if- @@ -1225,11 +1218,10 @@ class ConvertNonTrivialConvOp return false; } int64_t stride = strides[i + 1]; - int64_t input_size = - conv_op.getLhs().getType().cast().getDimSize( - input_spatial_dims[i]); - int64_t output_size = conv_op.getType().cast().getDimSize( - output_spatial_dims[i]); + int64_t input_size = mlir::cast(conv_op.getLhs().getType()) + .getDimSize(input_spatial_dims[i]); + int64_t output_size = mlir::cast(conv_op.getType()) + .getDimSize(output_spatial_dims[i]); // The reason for the below check is as follows: // When computing the output, we have the following relation between // o - output dim size, i - input dim size, s - stride, P - total pads @@ -1280,13 +1272,11 @@ class ConvertDynamicSliceOp : public OpConversionPattern { LogicalResult matchAndRewrite( mhlo::DynamicSliceOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { - ShapedType input_type = op.getOperand().getType().cast(); + ShapedType input_type = mlir::cast(op.getOperand().getType()); if (!input_type.hasStaticShape()) return failure(); - Type start_indices_element_type = op.getStartIndices() - .front() - .getType() - .cast() - .getElementType(); + Type start_indices_element_type = + mlir::cast(op.getStartIndices().front().getType()) + .getElementType(); // The mhlo dynamic_slice's start_indices can be either signed/unsigned // int32/int64. However, TF only takes in either i32 or i64 types for begin, @@ -1307,8 +1297,8 @@ class ConvertDynamicSliceOp : public OpConversionPattern { for (uint64_t i = 0, e = op.getStartIndices().size(); i < e; ++i) { // Always put a cast there. 
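// Casting every start index, even one already of the target width, keeps the
// loop uniform; the clamp constructed below then bounds the index into range.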
auto start = op.getStartIndices()[i]; - auto cast_type = start.getType().cast().clone( - signed_start_indices_element_type); + auto cast_type = mlir::cast(start.getType()) + .clone(signed_start_indices_element_type); auto cast_op = rewriter.create(op.getLoc(), cast_type, start); Value clamp_max = rewriter.create( op.getLoc(), rewriter.getIntegerAttr( @@ -1409,11 +1399,11 @@ class ConvertDynamicUpdateSliceOp LogicalResult matchAndRewrite( mhlo::DynamicUpdateSliceOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { - ShapedType operand_type = op.getOperand().getType().cast(); + ShapedType operand_type = mlir::cast(op.getOperand().getType()); ShapedType update_type = - op.getUpdate().getType().dyn_cast_or_null(); - ShapedType start_indices_type = - op.getStartIndices().front().getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(op.getUpdate().getType()); + ShapedType start_indices_type = mlir::dyn_cast_or_null( + op.getStartIndices().front().getType()); if (update_type == nullptr || start_indices_type == nullptr) return rewriter.notifyMatchFailure( op, "update and start_indices should have ShapedType"); @@ -1474,8 +1464,8 @@ class ConvertSortToTfTopk : public OpConversionPattern { op, "only match for the case where operands is of size 2"); auto keys = op.getInputs()[0]; auto indices = op.getInputs()[1]; - auto keys_ty = keys.getType().dyn_cast_or_null(); - auto indices_ty = indices.getType().dyn_cast_or_null(); + auto keys_ty = mlir::dyn_cast_or_null(keys.getType()); + auto indices_ty = mlir::dyn_cast_or_null(indices.getType()); if (!keys_ty || !keys_ty.hasStaticShape() || !keys_ty.getElementType().isIntOrFloat()) return rewriter.notifyMatchFailure( @@ -1589,7 +1579,7 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, DotDimensionsInfo dot_dimensions_info, ImplicitLocOpBuilder& builder, bool is_lhs) { - auto operand_type = operand.getType().cast(); + auto operand_type = mlir::cast(operand.getType()); BoolAttr true_attr = builder.getBoolAttr(true); auto operand_shape = builder.create(operand, true_attr); const int64_t operand_rank = operand_type.getRank(); @@ -1665,8 +1655,8 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, DotDimensionNumbersAttr dot_dimension_numbers, ShapedType result_type, mlir::Location loc) { - auto lhs_type = lhs.getType().cast(); - auto rhs_type = rhs.getType().cast(); + auto lhs_type = mlir::cast(lhs.getType()); + auto rhs_type = mlir::cast(rhs.getType()); const int lhs_rank = lhs_type.getRank(); const int rhs_rank = rhs_type.getRank(); ImplicitLocOpBuilder builder(loc, rewriter); @@ -1821,7 +1811,7 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, // necessary. Value ConvertDotOp(PatternRewriter& rewriter, Operation* old_op) { auto dot_op = cast(old_op); - auto lhs_rank = dot_op.getLhs().getType().cast().getRank(); + auto lhs_rank = mlir::cast(dot_op.getLhs().getType()).getRank(); auto dot_dimension_numbers = DotDimensionNumbersAttr::get(rewriter.getContext(), /*lhs_batching_dimensions=*/{}, @@ -1831,17 +1821,18 @@ Value ConvertDotOp(PatternRewriter& rewriter, Operation* old_op) { /*rhs_contracting_dimensions=*/{0}); return ConvertDot( rewriter, dot_op.getLhs(), dot_op.getRhs(), dot_dimension_numbers, - dot_op.getResult().getType().cast(), dot_op.getLoc()); + mlir::cast(dot_op.getResult().getType()), dot_op.getLoc()); } // Converts mhlo.dot to tf.BatchMatMul. 
Reshape or Transpose ops will also be // inserted to convert to well-formed matrix multiply. Value ConvertDotGeneralOp(PatternRewriter& rewriter, Operation* old_op) { auto dot_general_op = cast(old_op); - return ConvertDot(rewriter, dot_general_op.getLhs(), dot_general_op.getRhs(), - dot_general_op.getDotDimensionNumbers(), - dot_general_op.getResult().getType().cast(), - dot_general_op.getLoc()); + return ConvertDot( + rewriter, dot_general_op.getLhs(), dot_general_op.getRhs(), + dot_general_op.getDotDimensionNumbers(), + mlir::cast(dot_general_op.getResult().getType()), + dot_general_op.getLoc()); } // Replace BinaryOp with a combination of TfBinaryOp and TfReduceOp if the @@ -1940,9 +1931,9 @@ class ConvertReduceOpToTfOp : public OpConversionPattern { reduce_op.getResults().size() != 1) return failure(); - if (!reduce_op.getInputs()[0].getType().isa()) + if (!mlir::isa(reduce_op.getInputs()[0].getType())) return failure(); - if (!reduce_op.getType(0).isa()) return failure(); + if (!mlir::isa(reduce_op.getType(0))) return failure(); return success(); } }; @@ -1953,13 +1944,13 @@ class ConvertReduceOpToTfProd using ConvertReduceOpToTfOp::ConvertReduceOpToTfOp; LogicalResult MatchInitValue(Value init_value) const override { - auto type = init_value.getType().cast().getElementType(); - if (type.isa()) { + auto type = mlir::cast(init_value.getType()).getElementType(); + if (mlir::isa(type)) { float const_value; if (failed(GetConstantSplatValue(init_value, const_value)) || const_value != 1.0) return failure(); - } else if (type.isa() && type.isSignlessInteger()) { + } else if (mlir::isa(type) && type.isSignlessInteger()) { int32_t const_value; if (failed(GetConstantSplatValue(init_value, const_value)) || const_value != 1) @@ -1978,13 +1969,13 @@ class ConvertReduceOpToTfSum using ConvertReduceOpToTfOp::ConvertReduceOpToTfOp; LogicalResult MatchInitValue(Value init_value) const override { - auto type = init_value.getType().cast().getElementType(); - if (type.isa()) { + auto type = mlir::cast(init_value.getType()).getElementType(); + if (mlir::isa(type)) { APFloat const_value(.0); if (failed(GetConstantSplatValue(init_value, const_value)) || !const_value.isZero()) return failure(); - } else if (type.isa() && type.isSignlessInteger()) { + } else if (mlir::isa(type) && type.isSignlessInteger()) { APInt const_value; if (failed(GetConstantSplatValue(init_value, const_value)) || !const_value.isZero()) @@ -2003,13 +1994,13 @@ class ConvertReduceOpToTfMax using ConvertReduceOpToTfOp::ConvertReduceOpToTfOp; LogicalResult MatchInitValue(Value init_value) const override { - auto type = init_value.getType().cast().getElementType(); - if (type.isa()) { + auto type = mlir::cast(init_value.getType()).getElementType(); + if (mlir::isa(type)) { APFloat const_value(.0); if (failed(GetConstantSplatValue(init_value, const_value)) || !const_value.isInfinity() || !const_value.isNegative()) return failure(); - } else if (type.isa() && type.isSignlessInteger()) { + } else if (mlir::isa(type) && type.isSignlessInteger()) { APInt const_value; if (failed(GetConstantSplatValue(init_value, const_value)) || !const_value.isMinSignedValue()) @@ -2027,14 +2018,14 @@ class ConvertReduceOpToTfMin using ConvertReduceOpToTfOp::ConvertReduceOpToTfOp; LogicalResult MatchInitValue(Value init_value) const override { - auto type = init_value.getType().cast().getElementType(); + auto type = mlir::cast(init_value.getType()).getElementType(); - if (type.isa()) { + if (mlir::isa(type)) { APFloat const_value(.0); if 
(failed(GetConstantSplatValue(init_value, const_value)) || !const_value.isInfinity() || const_value.isNegative()) return failure(); - } else if (type.isa() && type.isSignlessInteger()) { + } else if (mlir::isa(type) && type.isSignlessInteger()) { APInt const_value; if (failed(GetConstantSplatValue(init_value, const_value)) || !const_value.isMaxSignedValue()) @@ -2088,7 +2079,7 @@ class ConvertReduceOpToTfArgmax auto element_type = attr.getType().getElementType(); if (attr.getNumElements() != 1 || !element_type.isIntOrFloat()) return false; - if (element_type.isa()) { + if (mlir::isa(element_type)) { auto value = *attr.value_begin(); return value.isNegative() && value.isInfinity(); } else if (element_type.isInteger(1)) { @@ -2112,7 +2103,7 @@ class ConvertReduceOpToTfArgmin auto element_type = attr.getType().getElementType(); if (attr.getNumElements() != 1 || !element_type.isIntOrFloat()) return false; - if (element_type.isa()) { + if (mlir::isa(element_type)) { auto value = *attr.value_begin(); return !value.isNegative() && value.isInfinity(); } else if (element_type.isInteger(1)) { @@ -2134,18 +2125,18 @@ class ConvertIotaOpToTfRange : public OpConversionPattern { mhlo::IotaOp iota_op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { RankedTensorType type = - iota_op.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(iota_op.getType()); // TF::RangeOp doesn't support UI16. if (!type || type.getElementType().isUnsignedInteger(16)) return failure(); const uint64_t dimension = iota_op.getIotaDimension(); Type element_type = type.getElementType(); Attribute start, limit, delta; - if (element_type.isa()) { + if (mlir::isa(element_type)) { start = rewriter.getFloatAttr(element_type, 0.0); limit = rewriter.getFloatAttr(element_type, type.getShape()[dimension]); delta = rewriter.getFloatAttr(element_type, 1.0); - } else if (element_type.isa()) { + } else if (mlir::isa(element_type)) { start = rewriter.getIntegerAttr(element_type, 0); limit = rewriter.getIntegerAttr(element_type, type.getShape()[dimension]); delta = rewriter.getIntegerAttr(element_type, 1); @@ -2249,9 +2240,10 @@ bool IsSpatialPoolingWithoutDilation( // Check that the individual padding values are corresponding to SAME // padding from TensorFlow. - auto operand_type = rw.getInputs()[0].getType().dyn_cast(); + auto operand_type = + mlir::dyn_cast(rw.getInputs()[0].getType()); RankedTensorType output_type = - rw.getResult(0).getType().dyn_cast(); + mlir::dyn_cast(rw.getResult(0).getType()); if (!operand_type || !output_type) return false; for (uint64_t i = 1; i < rank - 1; ++i) { @@ -2293,12 +2285,13 @@ class ConvertLoweredCumOp : public OpConversionPattern { auto const_op = llvm::dyn_cast_or_null( rw.getInitValues()[0].getDefiningOp()); if (!const_op) return failure(); - auto const_op_dense_value = const_op.getValue().cast(); + auto const_op_dense_value = + mlir::cast(const_op.getValue()); if (!const_op_dense_value || !IsInitValue(const_op_dense_value)) { return failure(); } - auto operand_type = rw.getInputs()[0].getType().cast(); + auto operand_type = mlir::cast(rw.getInputs()[0].getType()); // For a cumulative op, require a tensor of 1s for each dimension in // operand. 
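The recurring edit in these C++ hunks is mechanical: deprecated member-function casts on mlir::Type and mlir::Attribute are replaced by the free functions re-exported from mlir/Support/LLVM.h. A minimal before/after sketch, assuming a recent MLIR checkout; the helper IsStaticFloatTensor is hypothetical, and ShapedType/FloatType stand in for whichever template argument each call site actually uses:

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"

// Returns true if `v` is a statically shaped tensor with float elements.
static bool IsStaticFloatTensor(mlir::Value v) {
  // Deprecated member-function style:
  //   auto ty = v.getType().dyn_cast_or_null<mlir::ShapedType>();
  //   if (!ty || !ty.getElementType().isa<mlir::FloatType>()) return false;
  // Free-function style used throughout this patch:
  auto ty = mlir::dyn_cast_or_null<mlir::ShapedType>(v.getType());
  if (!ty || !mlir::isa<mlir::FloatType>(ty.getElementType())) return false;
  return ty.hasStaticShape();
}

The same one-to-one mapping covers cast, dyn_cast, and isa_and_nonnull; behavior is unchanged, only the call syntax moves.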
@@ -2383,7 +2376,7 @@ class ConvertLoweredCumSumOp auto element_type = attr.getType().getElementType(); if (attr.getNumElements() != 1 || !element_type.isIntOrFloat()) return false; - if (element_type.isa()) { + if (mlir::isa(element_type)) { auto value = *attr.value_begin(); return value.isZero(); } @@ -2399,7 +2392,7 @@ class ConvertLoweredCumProdOp auto element_type = attr.getType().getElementType(); if (attr.getNumElements() != 1 || !element_type.isIntOrFloat()) return false; - if (element_type.isa()) { + if (mlir::isa(element_type)) { auto value = *attr.value_begin(); return value.isExactlyValue(1.0); } @@ -2431,8 +2424,8 @@ class ConvertAvgPoolOp : public OpConversionPattern { // Check that this is a floating point reduce window with a rank of 4 or 5. const RankedTensorType rw_type = - rw.getResult(0).getType().dyn_cast(); - if (!rw_type || !rw_type.getElementType().isa() || + mlir::dyn_cast(rw.getResult(0).getType()); + if (!rw_type || !mlir::isa(rw_type.getElementType()) || rw_type.getRank() <= 3 || rw_type.getRank() > 5) return failure(); @@ -2568,8 +2561,8 @@ class ConvertMaxPoolOp : public OpConversionPattern { // Check that this is a floating point reduce window with a rank of 4 or 5. const RankedTensorType rw_type = - rw.getResult(0).getType().dyn_cast(); - if (!rw_type || !rw_type.getElementType().isa() || + mlir::dyn_cast(rw.getResult(0).getType()); + if (!rw_type || !mlir::isa(rw_type.getElementType()) || rw_type.getRank() <= 3 || rw_type.getRank() > 5) return failure(); @@ -2639,7 +2632,7 @@ class ConvertMaxPoolOp : public OpConversionPattern { // Returns the shape of the given value in a Constant Op. arith::ConstantOp ShapeToConst(PatternRewriter& rewriter, Value value) { - ArrayRef shape = value.getType().cast().getShape(); + ArrayRef shape = mlir::cast(value.getType()).getShape(); auto attr_type = RankedTensorType::get({static_cast(shape.size())}, rewriter.getIntegerType(64)); auto attr = DenseElementsAttr::get(attr_type, shape); @@ -2659,36 +2652,37 @@ bool IsSign(APFloat a, APFloat sign) { } bool IsDenseSplatIntAttr(ElementsAttr float_or_int) { - return float_or_int.isa() && - float_or_int.isa(); + return mlir::isa(float_or_int) && + mlir::isa(float_or_int); } bool IsDenseSplatFloatAttr(ElementsAttr float_or_int) { - return float_or_int.isa() && - float_or_int.isa(); + return mlir::isa(float_or_int) && + mlir::isa(float_or_int); } bool ValueIsReciprocal(ElementsAttr float_or_int, ElementsAttr rhs) { if (IsDenseSplatFloatAttr(float_or_int) && IsDenseSplatFloatAttr(float_or_int)) { - return (float_or_int.cast().getSplatValue() * - rhs.cast().getSplatValue()) + return (mlir::cast(float_or_int) + .getSplatValue() * + mlir::cast(rhs).getSplatValue()) .isExactlyValue(1.0); } else if (IsDenseSplatIntAttr(float_or_int) && IsDenseSplatIntAttr(float_or_int)) { - return (float_or_int.cast().getSplatValue() * - rhs.cast().getSplatValue()) == 1; + return (mlir::cast(float_or_int).getSplatValue() * + mlir::cast(rhs).getSplatValue()) == 1; } return false; } bool ValueEquals(ElementsAttr float_or_int, double rhs) { if (IsDenseSplatFloatAttr(float_or_int)) { - return float_or_int.cast() + return mlir::cast(float_or_int) .getSplatValue() .isExactlyValue(rhs); } else if (IsDenseSplatIntAttr(float_or_int)) { - return float_or_int.cast().getSplatValue() == + return mlir::cast(float_or_int).getSplatValue() == static_cast(rhs); } return false; @@ -2696,11 +2690,12 @@ bool ValueEquals(ElementsAttr float_or_int, double rhs) { bool ValueGreaterThanZero(ElementsAttr float_or_int) { if 
(IsDenseSplatIntAttr(float_or_int)) { - auto value = float_or_int.cast().getSplatValue(); + auto value = + mlir::cast(float_or_int).getSplatValue(); return !value.isNegative() && !value.isZero(); } else if (IsDenseSplatFloatAttr(float_or_int)) { auto value = - float_or_int.cast().getSplatValue(); + mlir::cast(float_or_int).getSplatValue(); return !value.isNaN() && !value.isNegative() && !value.isZero(); } return false; @@ -2723,13 +2718,13 @@ bool TensorIsSign(PatternRewriter& rewriter, ElementsAttr float_or_int, int_spl && sgn_cst_spl) { return IsSign(int_spl.getValue(), sgn_cst_spl.getValue()); } - if (float_or_int.isa()) { + if (mlir::isa(float_or_int)) { auto sgn_splat_value = sgn_splat.getSplatValue(); return llvm::all_of(float_or_int.getValues(), [&](APFloat value) { return IsSign(value, sgn_splat_value); }); } - if (float_or_int.isa()) { + if (mlir::isa(float_or_int)) { auto sgn_splat_value = sgn_splat.getSplatValue(); return llvm::all_of(float_or_int.getValues(), [&](APInt value) { return IsSign(value, sgn_splat_value); @@ -2778,9 +2773,11 @@ class ConvertGatherOp : public OpConversionPattern { Value start_indices = gather_op.getStartIndices(); // Can only convert with static shaped gather. - ShapedType operand_type = operand.getType().cast(); - ShapedType start_indices_type = start_indices.getType().cast(); - ShapedType result_type = gather_op.getResult().getType().cast(); + ShapedType operand_type = mlir::cast(operand.getType()); + ShapedType start_indices_type = + mlir::cast(start_indices.getType()); + ShapedType result_type = + mlir::cast(gather_op.getResult().getType()); if (!operand_type.hasStaticShape()) { gather_op.emitOpError() << "Dynamic shaped operand is not supported."; return failure(); @@ -2917,9 +2914,11 @@ class ConvertGatherOp : public OpConversionPattern { static const int max_batch_size = 50; // Can only convert with static shaped gather. - ShapedType operand_type = operand.getType().cast(); - ShapedType start_indices_type = start_indices.getType().cast(); - ShapedType result_type = gather_op.getResult().getType().cast(); + ShapedType operand_type = mlir::cast(operand.getType()); + ShapedType start_indices_type = + mlir::cast(start_indices.getType()); + ShapedType result_type = + mlir::cast(gather_op.getResult().getType()); if (!operand_type.hasStaticShape() || !start_indices_type.hasStaticShape() || !result_type.hasStaticShape()) { return rewriter.notifyMatchFailure( @@ -3140,7 +3139,7 @@ class ConvertWhileOp : public OpConversionPattern { // This rule doesn't support mhlo::WhileOp with tuple inputs. for (auto type : while_op->getOperandTypes()) { - if (type.isa()) return failure(); + if (mlir::isa(type)) return failure(); } // Creates a TF::WhileRegionOp to replace the mhlo::WhileOp. 
HLO WhileOp @@ -3296,7 +3295,7 @@ class ConvertCustomCallWithApproxTopK } } auto backend_config = - op.getBackendConfigAttr().dyn_cast_or_null(); + mlir::dyn_cast_or_null(op.getBackendConfigAttr()); if (!backend_config) { return op.emitOpError() << "Missing backend_config attribute"; } @@ -3385,12 +3384,13 @@ class ConvertCustomCallWithApproxTopK << "ApproxTopK takes exactly 1 called_computation."; } mlir::func::FuncOp callee = module_op_->lookupSymbol( - op.getCalledComputations()[0].cast()); + mlir::cast(op.getCalledComputations()[0])); mlir::FunctionType callee_type = callee.getFunctionType(); SmallVector expected_callee_input_types; auto num_inputs = op.getInputs().size() / 2; for (unsigned i = 0; i < num_inputs; ++i) { - auto input_type = op.getOperand(i).getType().dyn_cast(); + auto input_type = + mlir::dyn_cast(op.getOperand(i).getType()); auto scalar = RankedTensorType::get({}, input_type.getElementType()); expected_callee_input_types.push_back(scalar); expected_callee_input_types.push_back(scalar); @@ -3491,12 +3491,10 @@ class ConvertRealDynamicSliceOp LogicalResult matchAndRewrite( mhlo::RealDynamicSliceOp real_dynamic_slice_op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { - auto start_indices_type = real_dynamic_slice_op.getStartIndices() - .getType() - .cast(); - auto end_indices_type = real_dynamic_slice_op.getLimitIndices() - .getType() - .cast(); + auto start_indices_type = mlir::cast( + real_dynamic_slice_op.getStartIndices().getType()); + auto end_indices_type = mlir::cast( + real_dynamic_slice_op.getLimitIndices().getType()); if (start_indices_type.getNumDynamicDims() != 0 || end_indices_type.getNumDynamicDims() != 0) { @@ -3522,7 +3520,7 @@ class ConvertDynamicIotaOp : public OpConversionPattern { mhlo::DynamicIotaOp dynamic_iota_op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { RankedTensorType type = - dynamic_iota_op.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(dynamic_iota_op.getType()); if (!type || type.getElementType().isUnsignedInteger(64)) { return rewriter.notifyMatchFailure(dynamic_iota_op, "TF::RangeOp doesn't support UI64"); @@ -3538,19 +3536,19 @@ class ConvertDynamicIotaOp : public OpConversionPattern { const uint64_t dimension = dynamic_iota_op.getIotaDimension(); Type element_type = type.getElementType(); Attribute start, delta; - if (element_type.isa()) { + if (mlir::isa(element_type)) { start = rewriter.getFloatAttr(element_type, 0.0); delta = rewriter.getFloatAttr(element_type, 1.0); - } else if (element_type.isa()) { + } else if (mlir::isa(element_type)) { start = rewriter.getIntegerAttr(element_type, 0); delta = rewriter.getIntegerAttr(element_type, 1); } else { return failure(); } auto output_shape = dynamic_iota_op.getOperand(); - if (element_type.isa()) { + if (mlir::isa(element_type)) { auto cast_type = - output_shape.getType().cast().clone(element_type); + mlir::cast(output_shape.getType()).clone(element_type); output_shape = rewriter.create(dynamic_iota_op.getLoc(), cast_type, output_shape); } @@ -3581,7 +3579,7 @@ bool IsTFStyleBroadcast(DenseIntElementsAttr broadcast_dimensions, // broadcast_dimensions is an increasing list by definition, thus it suffices // to check the first element. 
int64_t input_rank = broadcast_dimensions.getNumElements(); - int64_t output_rank = output.getType().cast().getRank(); + int64_t output_rank = mlir::cast(output.getType()).getRank(); return input_rank == 0 || (broadcast_dimensions.getValues()[0].getSExtValue() == output_rank - input_rank); @@ -3606,11 +3604,12 @@ arith::ConstantOp ExpandedShape(PatternRewriter& rewriter, Value input, Value output) { // Initialize expanded shape with output rank and dimensions of 1. SmallVector expanded_shape( - output.getType().cast().getRank(), + mlir::cast(output.getType()).getRank(), /*Value=*/rewriter.getI64IntegerAttr(1)); // Set dimension sizes specified by broadcast_dimensions. - ArrayRef input_shape = input.getType().cast().getShape(); + ArrayRef input_shape = + mlir::cast(input.getType()).getShape(); for (auto x : llvm::enumerate(broadcast_dimensions)) { expanded_shape[x.value().getSExtValue()] = rewriter.getI64IntegerAttr(input_shape[x.index()]); @@ -3627,9 +3626,9 @@ arith::ConstantOp ExpandedShape(PatternRewriter& rewriter, Value input, Value ExpandedDynamicShape(PatternRewriter& rewriter, Value input, DenseIntElementsAttr broadcast_dimensions, Value output) { - assert(output.getType().cast() && + assert(mlir::cast(output.getType()) && "output type must be of ShapedType"); - int64_t output_rank = output.getType().cast().getRank(); + int64_t output_rank = mlir::cast(output.getType()).getRank(); llvm::SmallVector expanded_dimensions; llvm::SmallSet broadcast_dimensions_values; for (auto x : llvm::enumerate(broadcast_dimensions)) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc index 9d52ee30dd3ce7..520cff8681156a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc @@ -37,7 +37,7 @@ LogicalResult ConvertCustomCallOp::matchAndRewrite( rewriter.getStringAttr(mhlo_custom_call.getCallTargetName())); if (auto bc = mhlo_custom_call.getBackendConfig()) { - if (auto stringattr = bc->dyn_cast_or_null()) { + if (auto stringattr = mlir::dyn_cast_or_null(*bc)) { tfl_custom.setCustomOptionAttr( TFL::ConstBytesAttr::get(rewriter.getContext(), stringattr)); } @@ -53,7 +53,7 @@ LogicalResult ConvertCustomCallOp::matchAndRewrite( std::optional IsCustomCallLegal(mhlo::CustomCallOp op) { if (op.getCallTargetName().starts_with("custom_call.")) { auto bc = op.getBackendConfig(); - if (!bc || bc->isa()) { + if (!bc || mlir::isa(*bc)) { return false; } } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc index ef3337cbca27cd..ccd726d2737f84 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc @@ -169,7 +169,7 @@ Value BuildDotOperandFlattenedShapeOp(Value operand, DotDimensionsInfo dot_dimensions_info, ImplicitLocOpBuilder& builder, bool is_lhs) { - auto operand_type = operand.getType().cast(); + auto operand_type = mlir::cast(operand.getType()); auto operand_shape = builder.create( RankedTensorType::get(static_cast(operand_type.getRank()), builder.getIntegerType(32)), @@ -248,8 +248,8 @@ Value BuildDotOperandFlattenedShapeOp(Value 
operand, Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, mhlo::DotDimensionNumbersAttr dot_dimension_numbers, ShapedType result_type, mlir::Location loc) { - auto lhs_type = lhs.getType().cast(); - auto rhs_type = rhs.getType().cast(); + auto lhs_type = mlir::cast(lhs.getType()); + auto rhs_type = mlir::cast(rhs.getType()); const int lhs_rank = lhs_type.getRank(); const int rhs_rank = rhs_type.getRank(); ImplicitLocOpBuilder builder(loc, rewriter); @@ -412,7 +412,7 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs, // be inserted when necessary. See ConvertDotGeneralOp for additional notes. Value ConvertDotOp(PatternRewriter& rewriter, Operation* old_op) { auto dot_op = cast(old_op); - auto lhs_rank = dot_op.getLhs().getType().cast().getRank(); + auto lhs_rank = mlir::cast(dot_op.getLhs().getType()).getRank(); auto dot_dimension_numbers = mhlo::DotDimensionNumbersAttr::get(rewriter.getContext(), /*lhsBatchingDimensions=*/{}, @@ -422,15 +422,16 @@ Value ConvertDotOp(PatternRewriter& rewriter, Operation* old_op) { /*rhsContractingDimensions=*/{0}); return ConvertDot( rewriter, dot_op.getLhs(), dot_op.getRhs(), dot_dimension_numbers, - dot_op.getResult().getType().cast(), dot_op.getLoc()); + mlir::cast(dot_op.getResult().getType()), dot_op.getLoc()); } Value ConvertDotGeneralOp(PatternRewriter& rewriter, Operation* old_op) { auto dot_general_op = cast(old_op); - return ConvertDot(rewriter, dot_general_op.getLhs(), dot_general_op.getRhs(), - dot_general_op.getDotDimensionNumbers(), - dot_general_op.getResult().getType().cast(), - dot_general_op.getLoc()); + return ConvertDot( + rewriter, dot_general_op.getLhs(), dot_general_op.getRhs(), + dot_general_op.getDotDimensionNumbers(), + mlir::cast(dot_general_op.getResult().getType()), + dot_general_op.getLoc()); } } // namespace odml } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.h index bfb705d00813d5..157cb82ce8e94e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.h @@ -74,7 +74,7 @@ class ConvertReduceOpToArgMinMax : public OpConversionPattern { if (!MatchIota(reduce_op.getDimensions(), iota)) return failure(); // Match the reduction computation. - const bool is_float = operand_init.getElementType().isa(); + const bool is_float = mlir::isa(operand_init.getElementType()); if (failed(MatchReduceToArgMinMaxType1(reduce_op, is_float, is_argmax)) && failed(MatchReduceToArgMinMaxType2(reduce_op, is_argmax))) return rewriter.notifyMatchFailure( @@ -91,8 +91,8 @@ class ConvertReduceOpToArgMinMax : public OpConversionPattern { // Generate a Max and an ArgMax of as the mhlo op returns both while in TF // we have separate ops for them. If only one of them is used then the other // one will be garbage collected later. - if (!operand.getType().isa()) return failure(); - auto operand_type = operand.getType().cast(); + if (!mlir::isa(operand.getType())) return failure(); + auto operand_type = mlir::cast(operand.getType()); if (operand_type.getElementType().isInteger(1)) { // TF does not support min or max on boolean (int1) arguments. // Use AnyOp for MaxOp and AllOp for MinOp. 
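The hunks above and below all apply the same mechanical migration: LLVM and MLIR have deprecated the member-function casting API (x.isa<T>(), x.cast<T>(), x.dyn_cast<T>(), x.dyn_cast_or_null<T>()) in favor of the free functions that mlir/Support/LLVM.h re-exports from LLVM, which is why several files in this change also gain that include. A minimal self-contained sketch of the before/after pattern (illustrative placeholder code, not part of the patch):

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"

// Free-function casts take the value as an argument instead of being invoked
// on it; behavior is otherwise unchanged.
void CastMigrationExample(mlir::Type t) {
  // Before: if (t.isa<mlir::FloatType>()) { auto f = t.cast<mlir::FloatType>(); ... }
  if (mlir::isa<mlir::FloatType>(t)) {
    auto f = mlir::cast<mlir::FloatType>(t);  // asserts if the cast fails
    (void)f;
  }
  // dyn_cast returns a null result on mismatch; dyn_cast_or_null additionally
  // tolerates a null input, which is why the patch uses it for optional
  // attributes such as backend_config.
  if (auto ranked = mlir::dyn_cast<mlir::RankedTensorType>(t)) {
    (void)ranked;
  }
}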
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.h index fb0e0d80a4eb9b..f8ea8227137617 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.h @@ -95,9 +95,9 @@ class ConvertScatterOp : public OpConversionPattern { OperandRange updates = scatter_op.getUpdates(); if (operands.size() != 1 || updates.size() != 1) return failure(); - ShapedType operand_type = operands[0].getType().cast(); - ShapedType indices_type = indices.getType().cast(); - ShapedType updates_type = updates[0].getType().cast(); + ShapedType operand_type = mlir::cast(operands[0].getType()); + ShapedType indices_type = mlir::cast(indices.getType()); + ShapedType updates_type = mlir::cast(updates[0].getType()); Value new_updates = updates[0]; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc index c2f533776d0408..783f0431e9b964 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc @@ -203,7 +203,7 @@ Value InsertTranspose(Value value, int batch_dim, int feature_dim, int default_batch_dim, int default_feature_dim, int default_spatial_dim_start, int num_spatial_dims, ConversionPatternRewriter& rewriter) { - auto type = value.getType().cast(); + auto type = mlir::cast(value.getType()); DenseIntElementsAttr permutation; const int spatial_dim_start = spatial_dimensions.front(); if (!NeedsReformatTypeAndPermutation( @@ -224,7 +224,7 @@ Value InsertTranspose(Value value, int batch_dim, int feature_dim, Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { IntegerType new_ele_type = rewriter.getIntegerType(32); - if (auto shaped_type = val.getType().dyn_cast()) { + if (auto shaped_type = mlir::dyn_cast(val.getType())) { ShapedType new_type = RankedTensorType::get(shaped_type.getShape(), new_ele_type); return rewriter.create(loc, new_type, val); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc index 6e0a3325460b7a..50521a02c7b907 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc @@ -63,13 +63,13 @@ LogicalResult BuildOption(flexbuffers::Builder* fbb, Operation* op, const char* key = pair.getName().data(); const auto attr = pair.getValue(); - if (attr.isa<::mlir::IntegerAttr>()) { - fbb->Int(key, attr.dyn_cast().getInt()); + if (mlir::isa<::mlir::IntegerAttr>(attr)) { + fbb->Int(key, mlir::dyn_cast(attr).getInt()); return success(); } - if (attr.isa<::mlir::FloatAttr>()) { - fbb->Double(key, attr.dyn_cast().getValueAsDouble()); + if (mlir::isa<::mlir::FloatAttr>(attr)) { + fbb->Double(key, mlir::dyn_cast(attr).getValueAsDouble()); return success(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc index 
4cfb0e04e96af4..e699c303bbaac2 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc @@ -45,18 +45,19 @@ struct ReplaceCustomCallWithComposite final LogicalResult matchAndRewrite(mlir::stablehlo::CustomCallOp op, PatternRewriter &rewriter) const override { auto backendConfig = - op->getAttr("composite.backend_config").dyn_cast(); + mlir::dyn_cast(op->getAttr("composite.backend_config")); if (!backendConfig) return op->emitError( "custom_call has no 'composite.backend_config' attribute or the " "attribute is not a dictionary"); - auto name = backendConfig.get("name").dyn_cast(); + auto name = mlir::dyn_cast(backendConfig.get("name")); if (!name) return op->emitError( "backend_config has no 'name' key or the name value is not a string"); - auto attrs = backendConfig.get("attributes").dyn_cast(); + auto attrs = + mlir::dyn_cast(backendConfig.get("attributes")); if (!attrs) return op->emitError( "backend_config has no 'attributes' key or the attributes value is " @@ -66,7 +67,7 @@ struct ReplaceCustomCallWithComposite final if (!calledComputations || calledComputations.size() != 1) return op->emitError("expected exactly one called_computation"); - auto decomposition = calledComputations[0].cast(); + auto decomposition = mlir::cast(calledComputations[0]); auto composite = rewriter.create( op.getLoc(), op.getResultTypes(), op.getOperands(), name.str(), attrs, diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc index cd6199e3c152f7..b3a85259cc482a 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc @@ -41,6 +41,7 @@ limitations under the License. 
#include "stablehlo/transforms/Passes.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" +#include "tensorflow/lite/core/macros.h" #define DEBUG_TYPE "compat-passes" @@ -92,7 +93,7 @@ class StablehloToOdmlTypeConverter : public vhlo::VhloTypeConverter { return attr; if (auto stablehlo_attr = - attr.dyn_cast_or_null()) { + mlir::dyn_cast_or_null(attr)) { return vhlo::TypeExtensionsV1Attr::get(stablehlo_attr.getContext(), stablehlo_attr.getBounds()); } @@ -118,7 +119,8 @@ class VhloToStablehloTypeConverter : public vhlo::VhloTypeConverter { } Attribute convertEncoding(Attribute attr) const final { - if (auto vhlo_attr = attr.dyn_cast_or_null()) { + if (auto vhlo_attr = + mlir::dyn_cast_or_null(attr)) { return stablehlo::TypeExtensionsAttr::get(vhlo_attr.getContext(), vhlo_attr.getBounds()); } @@ -230,7 +232,7 @@ LogicalResult ApplyVhloToVersionPatterns(ModuleOp module, PassManager pm(module.getContext()); pm.addPass(stablehlo::createVhloToVersionPass({version})); if (failed(pm.run(module))) { - return module->emitError("Failed VHLO to version") << version; + return module->emitError("Failed VHLO to version ") << version; } return success(); } @@ -274,11 +276,11 @@ struct LegalizeStablehloToVhloPass LegalizeStablehloToVhloPass> { void runOnOperation() override { ModuleOp module = getOperation(); - std::string target_version = "0.14.0"; + std::string target_version = tflite_supported_stablehlo_version; VhloToStablehloTypeConverter to_builtin_converter; // StableHLO --> VHLO (allow funcs) - // VHLO -> Downgrade to 0.14.0 + // VHLO -> Downgrade to tflite_supported_stablehlo_version // VHLO Tensor --> Builtin Tensor // Remove cast(tensor->vhlo) -> cast(vhlo->tensor) pattern if (failed(ApplyStablehloToVhloPatterns(module, diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc index f7a136f2259ad2..82c7a4b4687055 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.cc @@ -105,8 +105,8 @@ void PrintOpStatsPass::runOnOperation() { (dyn_cast_or_null(op)) ? op->getOperand(1) : op->getResult(0); - ShapedType value_shaped_type = - value_for_deducing_op_type.getType().dyn_cast_or_null(); + ShapedType value_shaped_type = mlir::dyn_cast_or_null( + value_for_deducing_op_type.getType()); if (value_shaped_type != nullptr) { auto operand_or_result = value_shaped_type.getElementType(); std::string dtype; @@ -122,15 +122,16 @@ void PrintOpStatsPass::runOnOperation() { }) .Case([&](Type) { auto uniform_quantized_dtype = - operand_or_result.dyn_cast_or_null() + mlir::dyn_cast_or_null( + operand_or_result) .getStorageType(); dtype = absl::StrCat( "uq_", uniform_quantized_dtype.getIntOrFloatBitWidth()); }) .Case([&](Type) { auto uniform_quantized_dtype = - operand_or_result - .dyn_cast_or_null() + mlir::dyn_cast_or_null( + operand_or_result) .getStorageType(); dtype = absl::StrCat( "uq_", uniform_quantized_dtype.getIntOrFloatBitWidth()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc index b0797521798994..d9c23dfa12b8ae 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" @@ -37,8 +38,8 @@ namespace odml { // Convert mhlo.dot to mhlo.dot_general. LogicalResult ConvertDotToDotGeneral(mhlo::DotOp op, PatternRewriter &rewriter) { - auto lhs_type = op.getLhs().getType().cast(); - auto rhs_type = op.getRhs().getType().cast(); + auto lhs_type = mlir::cast(op.getLhs().getType()); + auto rhs_type = mlir::cast(op.getRhs().getType()); if (!lhs_type.hasRank() || !rhs_type.hasRank()) { return rewriter.notifyMatchFailure(op, "unsupported unranked input type"); } @@ -264,7 +265,7 @@ LogicalResult LiftDotConcatLHS(mhlo::ConcatenateOp concat, new_concat_shape[new_concat_dim] = 0; for (auto v : all_dot_lhs) { new_concat_shape[new_concat_dim] += - v.getType().dyn_cast().getShape()[new_concat_dim]; + mlir::dyn_cast(v.getType()).getShape()[new_concat_dim]; } auto new_concat = rewriter.create( @@ -353,7 +354,7 @@ LogicalResult LiftDotConcatLHSAndRHS(mhlo::ConcatenateOp concat, lhs_new_concat_shape[lhs_batch_dim] = 0; for (auto v : all_dot_lhs) { lhs_new_concat_shape[lhs_batch_dim] += - v.getType().dyn_cast().getShape()[lhs_batch_dim]; + mlir::dyn_cast(v.getType()).getShape()[lhs_batch_dim]; } const int64_t rhs_batch_dim = first_dot.getDotDimensionNumbers().getRhsBatchingDimensions()[0]; @@ -362,7 +363,7 @@ LogicalResult LiftDotConcatLHSAndRHS(mhlo::ConcatenateOp concat, rhs_new_concat_shape[rhs_batch_dim] = 0; for (auto v : all_dot_rhs) { rhs_new_concat_shape[rhs_batch_dim] += - v.getType().dyn_cast().getShape()[rhs_batch_dim]; + mlir::dyn_cast(v.getType()).getShape()[rhs_batch_dim]; } auto lhs_new_concat = rewriter.create( diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize_layout.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize_layout.cc new file mode 100644 index 00000000000000..11cb7254b75c76 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize_layout.cc @@ -0,0 +1,210 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for legalizing HLO to TensorFlow. 
+ +#include +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Utils/IndexingUtils.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" + +namespace mlir { +namespace odml { +namespace { + +#define DEBUG_TYPE "stablehlo-optimize-layout" + +#define GEN_PASS_DEF_TRANSPOSECOMMUTEOPSPASS +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h.inc" + +class TransposeCommuteOpsPass + : public impl::TransposeCommuteOpsPassBase<TransposeCommuteOpsPass> { + void runOnOperation() override; +}; + +// Inversely permute a given vector +static SmallVector<int64_t> InvertPermutationToVector(ArrayRef<int64_t> vec, + ArrayRef<int64_t> perm) { + return applyPermutation(vec, invertPermutationVector(perm)); +} + +static RankedTensorType GetPermutedTensorTypeHelper(RankedTensorType type, + ArrayRef<int64_t> perm, + bool isInvert) { + SmallVector<int64_t> permutedShape = applyPermutation( + type.getShape(), isInvert ?
invertPermutationVector(perm) : perm); + return RankedTensorType::get(permutedShape, type.getElementType()); +} + +static RankedTensorType GetInvertPermutedTensorType(RankedTensorType type, + ArrayRef<int64_t> perm) { + return GetPermutedTensorTypeHelper(type, perm, true /*isInvert*/); +} + +static Value CreateTranspose(OpBuilder& builder, Value source, + ArrayRef<int64_t> perm) { + return builder.create<stablehlo::TransposeOp>(source.getLoc(), source, perm) + ->getResult(0); +} + +// Transform pad(transpose(x)) to transpose(pad(x)) +struct TransposeCommuteWithPad : public OpRewritePattern<stablehlo::PadOp> { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(stablehlo::PadOp pad_op, + PatternRewriter& rewriter) const override { + Value pad_input = pad_op.getOperand(); + RankedTensorType pad_type = pad_op.getType().cast<RankedTensorType>(); + + auto transpose_op = pad_input.getDefiningOp<stablehlo::TransposeOp>(); + if (!transpose_op || !transpose_op->hasOneUse()) return failure(); + Value transpose_input = transpose_op.getOperand(); + + ArrayRef<int64_t> transpose_perm = transpose_op.getPermutation(); + SmallVector<int64_t> new_padding_low = + InvertPermutationToVector(pad_op.getEdgePaddingLow(), transpose_perm); + SmallVector<int64_t> new_padding_high = + InvertPermutationToVector(pad_op.getEdgePaddingHigh(), transpose_perm); + SmallVector<int64_t> new_padding_interior = + InvertPermutationToVector(pad_op.getInteriorPadding(), transpose_perm); + + RankedTensorType new_pad_type = + GetInvertPermutedTensorType(pad_type, transpose_perm); + Value new_pad = rewriter.create<stablehlo::PadOp>( + pad_op.getLoc(), new_pad_type, transpose_input, + pad_op.getPaddingValue(), new_padding_low, new_padding_high, + new_padding_interior); + + Value orig_pad = CreateTranspose(rewriter, new_pad, transpose_perm); + rewriter.replaceOp(pad_op, orig_pad); + return success(); + } +}; + +// Transform reduce_window(transpose(x)) to transpose(reduce_window(x)) +struct TransposeCommuteWithReduceWindow + : public OpRewritePattern<stablehlo::ReduceWindowOp> { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(stablehlo::ReduceWindowOp reduce_op, + PatternRewriter& rewriter) const override { + MLIRContext* ctx = reduce_op.getContext(); + ValueRange inputs = reduce_op.getInputs(); + // Only handle binary reduce ops for now + if (inputs.size() != 1) return failure(); + Value reduce_input = inputs[0]; + + RankedTensorType reduce_type = + reduce_op.getResultTypes()[0].cast<RankedTensorType>(); + + auto transpose_op = reduce_input.getDefiningOp<stablehlo::TransposeOp>(); + if (!transpose_op || !transpose_op->hasOneUse()) return failure(); + Value transpose_input = transpose_op.getOperand(); + + ArrayRef<int64_t> transpose_perm = transpose_op.getPermutation(); + + // Inversely transposes all the attributes to prepare for the new reduce op + auto getInvertPermutedAttr = + [&](std::optional<ArrayRef<int64_t>> vals) -> DenseI64ArrayAttr { + return vals.has_value() + ?
DenseI64ArrayAttr::get( + ctx, InvertPermutationToVector(*vals, transpose_perm)) + : nullptr; + }; + DenseI64ArrayAttr new_window_dimensions = + getInvertPermutedAttr(reduce_op.getWindowDimensions()); + DenseI64ArrayAttr new_window_strides = + getInvertPermutedAttr(reduce_op.getWindowStrides()); + DenseI64ArrayAttr new_base_dilations = + getInvertPermutedAttr(reduce_op.getBaseDilations()); + DenseI64ArrayAttr new_win_dilations = + getInvertPermutedAttr(reduce_op.getWindowDilations()); + + auto padding = reduce_op.getPadding(); + int64_t rank = transpose_perm.size(); + DenseIntElementsAttr new_padding_attr = nullptr; + if (padding.has_value()) { + SmallVector<int64_t> new_padding(rank * 2, 0); + auto old_padding = (*padding).getValues<int64_t>(); + for (int64_t idx = 0; idx < rank; ++idx) { + new_padding[2 * transpose_perm[idx]] = old_padding[2 * idx]; + new_padding[2 * transpose_perm[idx] + 1] = old_padding[2 * idx + 1]; + } + new_padding_attr = + DenseIntElementsAttr::get((*padding).getType(), new_padding); + } + + RankedTensorType new_reduce_type = + GetInvertPermutedTensorType(reduce_type, transpose_perm); + auto new_reduce_op = rewriter.create<stablehlo::ReduceWindowOp>( + reduce_op.getLoc(), new_reduce_type, transpose_input, + reduce_op.getInitValues()[0], new_window_dimensions, new_window_strides, + new_base_dilations, new_win_dilations, new_padding_attr); + IRMapping mapping; + reduce_op.getBody().cloneInto(&new_reduce_op.getBody(), mapping); + + Value orig_reduce_op = + CreateTranspose(rewriter, new_reduce_op->getResult(0), transpose_perm); + rewriter.replaceOp(reduce_op, orig_reduce_op); + return success(); + } +}; + +void TransposeCommuteOpsPass::runOnOperation() { + auto* ctx = &getContext(); + + RewritePatternSet patterns(ctx); + patterns.add<TransposeCommuteWithPad, TransposeCommuteWithReduceWindow>(ctx); + if (failed( + applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) { + return signalPassFailure(); + } +} + +} // end namespace + +std::unique_ptr<OperationPass<ModuleOp>> CreateTransposeCommuteOpsPass() { + return std::make_unique<TransposeCommuteOpsPass>(); +} + +static PassRegistration<TransposeCommuteOpsPass> pass; + +} // end namespace odml +} // end namespace mlir diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h index 49e8b673f63374..6c8e587871d393 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h @@ -49,6 +49,9 @@ CreateComposeUniformQuantizedTypePass(); std::unique_ptr> CreateUniformQuantizedStableHloToTflPass(); +// Create a pass that commutes transposes through specific ops +std::unique_ptr> CreateTransposeCommuteOpsPass(); + // Create a pass that legalizes MHLO to TF dialect.
std::unique_ptr> CreateLegalizeHloToTfPass(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td index a535d3aa867c80..1a1d7335b0517e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td @@ -114,3 +114,9 @@ def CompositeLoweringPass : Pass<"composite-lowering", "ModuleOp"> { let dependentDialects = ["mlir::mhlo::MhloDialect", "TFL::TensorFlowLiteDialect"]; let constructor = "mlir::odml::CreateCompositeLoweringPass()"; } + +def TransposeCommuteOpsPass : Pass<"transpose-commute-ops", "ModuleOp"> { + let summary = "Move transpose through specific ops"; + let dependentDialects = ["mlir::stablehlo::StablehloDialect"]; + let constructor = "mlir::odml::CreateTransposeCommuteOpsPass()"; +} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.cc index 72ae7cc1c0047d..81c6fc47473d43 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.cc @@ -18,10 +18,12 @@ limitations under the License. #include #include +#include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" namespace mlir { @@ -61,21 +63,40 @@ class RenameEntrypointToMainPass // } // clang-format on for (auto attr : session_initializer.getInitializers()) { - auto sym_attr = attr.dyn_cast(); + auto sym_attr = mlir::dyn_cast(attr); if (!sym_attr) break; entrypoints.erase(sym_attr.getValue()); } } if (entrypoints.empty()) { - fail(module, "No entrypoints found"); - } else if (entrypoints.size() == 1) { + return fail(module, "No entrypoints found"); + } + if (entrypoints.size() == 1) { auto entrypoint = entrypoints.begin()->second; Builder builder(entrypoint); entrypoint.setName(builder.getStringAttr("main")); - } else { - fail(module, "Too many entrypoints found"); + return; + } + + // If there is more than one entry point, choose the one with the + // 'tf.entry_function' attribute set.
+ llvm::SmallVector candidate_funcs; + for (auto& entrypoint : entrypoints) { + if (entrypoint.second->hasAttr("tf.entry_function")) { + candidate_funcs.push_back(entrypoint.second); + } + } + + if (candidate_funcs.empty()) { + return fail(module, "No entrypoints found"); + } + if (candidate_funcs.size() > 1) { + return fail(module, "Too many entrypoints found"); } + // Found entrypoint + Builder builder(candidate_funcs[0]); + candidate_funcs[0].setName(builder.getStringAttr("main")); } }; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc index 4304d34f4743ec..f86b78275fb951 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc @@ -66,7 +66,7 @@ class ConvertReduceOpToTFLiteArgmax auto element_type = attr.getType().getElementType(); if (attr.getNumElements() != 1 || !element_type.isIntOrFloat()) return false; - if (element_type.isa()) { + if (mlir::isa(element_type)) { auto value = *attr.value_begin(); return value.isNegative() && value.isInfinity(); } else if (element_type.isInteger(1)) { @@ -90,7 +90,7 @@ class ConvertReduceOpToTFLiteArgmin auto element_type = attr.getType().getElementType(); if (attr.getNumElements() != 1 || !element_type.isIntOrFloat()) return false; - if (element_type.isa()) { + if (mlir::isa(element_type)) { auto value = *attr.value_begin(); return !value.isNegative() && value.isInfinity(); } else if (element_type.isInteger(1)) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc index 7eb3abdef793eb..fdbf12538f230e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc @@ -77,7 +77,7 @@ void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize, // TF -> StableHLO legalization. AddLegalizeTFToStablehloPasses(pm, /*skip_quantization_ops=*/false, skip_resize, - /*skip_stateful_partitioned_call=*/false); + /*skip_partitioned_calls=*/false); // Wrap disallowed ops in stablehlo.custom_call ops. if (smuggle_disallowed_ops) { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc index e7f86a022d2274..7a3abd35d0d376 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc @@ -28,6 +28,7 @@ limitations under the License. 
#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" namespace mlir { @@ -60,7 +61,7 @@ class UnfoldSplatConstantPass void UnfoldSplatConstant(mlir::OpBuilder* op_builder, mhlo::ConstantOp const_op) const { auto splat_elements_attr = - const_op.getValue().dyn_cast(); + mlir::dyn_cast(const_op.getValue()); if (!splat_elements_attr) { return; } @@ -68,8 +69,8 @@ class UnfoldSplatConstantPass return; } auto element_type = splat_elements_attr.getType().getElementType(); - if (element_type.isa() || - element_type.isa()) { + if (mlir::isa(element_type) || + mlir::isa(element_type)) { return; } op_builder->setInsertionPoint(const_op); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfuse_batch_norm_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfuse_batch_norm_pass.cc index f4cdad00b79774..dadcabc55a5e57 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfuse_batch_norm_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfuse_batch_norm_pass.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" @@ -60,7 +61,8 @@ Value broadcastToFeatureDim(Location loc, RankedTensorType result_type, // Gets the shape of operand, assuming it is a dynamic shape with static rank. Value getShapeValue(Location loc, Value operand, PatternRewriter &rewriter) { - RankedTensorType resultType = operand.getType().dyn_cast(); + RankedTensorType resultType = + mlir::dyn_cast(operand.getType()); return rewriter.create( loc, RankedTensorType::get(/*shape=*/{resultType.getRank()}, @@ -92,8 +94,8 @@ Value materializeEpsilon(Operation *op, FloatAttr epsilon_attr, } auto scalar_type = RankedTensorType::get(/*shape=*/{}, fp_type); - auto epsilon_tensor_attr = - DenseElementsAttr::get(scalar_type, {epsilon_attr.cast()}); + auto epsilon_tensor_attr = DenseElementsAttr::get( + scalar_type, {mlir::cast(epsilon_attr)}); Value epsilon = b.create(epsilon_tensor_attr); auto dims_type = RankedTensorType::get(/*shape=*/{0}, b.getIntegerType(64)); auto dims = DenseIntElementsAttr::get(dims_type, SmallVector{}); @@ -113,7 +115,7 @@ class UnfuseBatchNormTrainingPattern LogicalResult matchAndRewrite(mhlo::BatchNormTrainingOp bn_op, PatternRewriter &rewriter) const override { auto inputs = bn_op.getOperand(); - auto input_type = inputs.getType().dyn_cast(); + auto input_type = mlir::dyn_cast(inputs.getType()); if (!input_type) { return failure(); } @@ -172,13 +174,14 @@ class UnfuseBatchNormInferencePattern // Enforce type invariants. // Note that we deduce the actual element type from the variance, // which should not be subject to quantization at a higher level. 
- auto input_type = bn_op.getOperand().getType().dyn_cast(); + auto input_type = + mlir::dyn_cast(bn_op.getOperand().getType()); auto variance_type = - bn_op.getVariance().getType().dyn_cast(); + mlir::dyn_cast(bn_op.getVariance().getType()); if (!input_type || !variance_type) { return failure(); } - auto fp_type = variance_type.getElementType().dyn_cast(); + auto fp_type = mlir::dyn_cast(variance_type.getElementType()); if (!fp_type) { return failure(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc index ad3bc3cd4cd24d..c3a05d5a0706a7 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc @@ -310,6 +310,137 @@ Type GetQuantizedOutputType(Operation* op, PatternRewriter& rewriter, new_result_quantized_type); } +// Matches the kernel dimension numbers, the input and output ranks, and the +// constant kernel required for legalization to TFLite convolution ops. +LogicalResult MatchConvolutionFormat(stablehlo::ConvolutionOp op) { + stablehlo::ConvDimensionNumbersAttr dimension_numbers = + op.getDimensionNumbers(); + const int64_t kernel_input_feature_dim = + dimension_numbers.getKernelInputFeatureDimension(); + if (kernel_input_feature_dim != 2) { + LLVM_DEBUG(llvm::dbgs() << "Expected kernel input feature == 2. Got: " + << kernel_input_feature_dim << ".\n"); + return failure(); + } + + const int64_t kernel_output_feature_dim = + dimension_numbers.getKernelOutputFeatureDimension(); + if (kernel_output_feature_dim != 3) { + LLVM_DEBUG(llvm::dbgs() << "Expected kernel output feature == 3. Got: " + << kernel_output_feature_dim << ".\n"); + return failure(); + } + + const auto input_type = op.getLhs().getType().cast(); + if (input_type.getRank() != 4) { + LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " + "Expected input rank of 4. Got: " + << input_type.getRank() << ".\n"); + return failure(); + } + + const auto filter_type = op.getRhs().getType().cast(); + if (filter_type.getRank() != 4) { + LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " + "Expected filter rank of 4. Got: " + << filter_type.getRank() << ".\n"); + return failure(); + } + + if (Operation* filter_op = op.getRhs().getDefiningOp(); + filter_op == nullptr || !isa(filter_op)) { + LLVM_DEBUG(llvm::dbgs() << "Filter should be a constant.\n"); + return failure(); + } + + return success(); +} + +// Transposes the convolution filter tensor of format [0, 1, i, o] to match the +// filter tensor format for TFLite convolution. The following transformations +// are supported: +// +// Depthwise case (`feature_group_count` > 1) +// * Permutes the given filter to `[i, 0, 1, o]` format. +// General convolution (`feature_group_count` = 1) +// * Permutes the given filter to `[o, 0, 1, i]` format. +// Using TransposeOp doesn't work because the quantized dimension +// changes, which violates the constraint for the TransposeOp that the +// input's and output's element type should be the same.
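// Editorial example (not part of the patch): for a filter of shape
// [0, 1, i, o] = [2, 2, 3, 8], the depthwise permutation {2, 0, 1, 3} below
// produces shape [3, 2, 2, 8] ([i, 0, 1, o], as `tfl.depthwise_conv_2d`
// expects), while the general permutation {3, 0, 1, 2} produces
// [8, 2, 2, 3] ([o, 0, 1, i], as `tfl.conv_2d` expects).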
+DenseIntElementsAttr TransposeFilterInConvolution( + Location loc, PatternRewriter& rewriter, + const DenseIntElementsAttr& filter_value_attr, const bool is_depthwise) { + ArrayRef filter_shape = filter_value_attr.getShapedType().getShape(); + SmallVector filter_constant_values{ + filter_value_attr.getValues()}; + SmallVector new_filter_constant_values(filter_constant_values.size(), + 0); + SmallVector transpose_dims; + if (is_depthwise) { + transpose_dims = {2, 0, 1, 3}; + } else { + transpose_dims = {3, 0, 1, 2}; + } + + SmallVector new_filter_shape; + new_filter_shape.reserve(filter_shape.size()); + for (int i = 0; i < filter_shape.size(); ++i) { + new_filter_shape.push_back(filter_shape[transpose_dims[i]]); + } + + auto get_array_idx = [](ArrayRef shape, const int i, const int j, + const int k, const int l) -> int64_t { + return (i * shape[1] * shape[2] * shape[3]) + (j * shape[2] * shape[3]) + + (k * shape[3]) + l; + }; + + // Transpose the filter value. + // TODO: b/336203735 - Use `DenseElementsTransposer` instead of manual + // transpose. + for (int i = 0; i < filter_shape[0]; ++i) { + for (int j = 0; j < filter_shape[1]; ++j) { + for (int k = 0; k < filter_shape[2]; ++k) { + for (int l = 0; l < filter_shape[3]; ++l) { + // [o, 0, 1, i] for `tfl.conv_2d` case`, + // [i, 0, 1, o] for `tfl.depthwise_conv_2d` case. + int old_idx = get_array_idx(filter_shape, i, j, k, l); + int new_idx = is_depthwise + ? get_array_idx(new_filter_shape, k, i, j, l) + : get_array_idx(new_filter_shape, l, i, j, k); + new_filter_constant_values[new_idx] = filter_constant_values[old_idx]; + } + } + } + } + + // Create the new filter constant. + auto new_filter_value_attr_type = + RankedTensorType::getChecked(loc, new_filter_shape, + /*elementType=*/rewriter.getI8Type()); + auto new_filter_constant_value_attr = DenseIntElementsAttr::get( + new_filter_value_attr_type, new_filter_constant_values); + + return new_filter_constant_value_attr; +} + +// Checks if the given convolution op is depthwise. +bool IsDepthwiseConvolution(stablehlo::ConvolutionOp op) { + // `feature_group_count` controls how the input channel dimension is + // split. + // A value bigger than one signals depthwise convolution behavior. + return op.getFeatureGroupCount() > 1; +} + +// Returns kernel output feature dimension of TFLite convolutions. +int64_t GetConvolutionKernelOutputFeatureDimension(bool is_depthwise) { + return is_depthwise ? 3 : 0; +} + +// Returns kernel input feature dimension of TFLite convolutions. +int64_t GetConvolutionKernelInputFeatureDimension(bool is_depthwise) { + return is_depthwise ? 0 : 3; +} + // stablehlo.uniform_quantize -> tfl.quantize // TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. class RewriteUniformQuantizeOp @@ -881,24 +1012,6 @@ class RewriteQuantizedConvolutionOp IsI32F32UniformQuantizedPerAxisType(GetElementType(op.getResult())); const bool fuse_bias_constant = FindUserOfType(op) && has_i32_output; - stablehlo::ConvDimensionNumbersAttr dimension_numbers = - op.getDimensionNumbers(); - - const int64_t kernel_input_feature_dim = - dimension_numbers.getKernelInputFeatureDimension(); - if (kernel_input_feature_dim != 2) { - LLVM_DEBUG(llvm::dbgs() << "Expected kernel input feature == 2. Got: " - << kernel_input_feature_dim << ".\n"); - return failure(); - } - - const int64_t kernel_output_feature_dim = - dimension_numbers.getKernelOutputFeatureDimension(); - if (kernel_output_feature_dim != 3) { - LLVM_DEBUG(llvm::dbgs() << "Expected kernel output feature == 3. 
Got: " - << kernel_output_feature_dim << ".\n"); - return failure(); - } if (failed(MatchInput(op.getOperand(0)))) { LLVM_DEBUG(llvm::dbgs() @@ -918,6 +1031,12 @@ class RewriteQuantizedConvolutionOp return failure(); } + if (failed(MatchConvolutionFormat(op))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match dimension format for convolution_op.\n"); + return failure(); + } + if (fuse_bias_constant) { Operation* add_op = FindUserOfType(op); if (add_op == nullptr) { @@ -941,7 +1060,7 @@ class RewriteQuantizedConvolutionOp stablehlo::ConvDimensionNumbersAttr dimension_numbers = op.getDimensionNumbers(); - const bool is_depthwise = IsDepthwiseConvolution(op, dimension_numbers); + const bool is_depthwise = IsDepthwiseConvolution(op); const bool is_transpose_conv = IsTransposeConv(op, dimension_numbers); const bool fuse_bias_constant = FindUserOfType(op) && has_i32_output; @@ -1029,13 +1148,6 @@ class RewriteQuantizedConvolutionOp private: static LogicalResult MatchInput(Value input) { auto input_type = input.getType().cast(); - if (input_type.getRank() != 4) { - LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " - "Expected input rank of 4. Got: " - << input_type.getRank() << ".\n"); - return failure(); - } - if (const auto input_element_type = input_type.getElementType(); !IsI8F32UniformQuantizedType(input_element_type)) { LLVM_DEBUG(llvm::dbgs() @@ -1049,13 +1161,6 @@ class RewriteQuantizedConvolutionOp static LogicalResult MatchFilter(Value filter) { auto filter_type = filter.getType().cast(); - if (filter_type.getRank() != 4) { - LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " - "Expected filter rank of 4. Got: " - << filter_type.getRank() << ".\n"); - return failure(); - } - const Type filter_element_type = filter_type.getElementType(); if (!IsI8F32UniformQuantizedPerAxisType(filter_type.getElementType())) { LLVM_DEBUG( @@ -1071,12 +1176,6 @@ class RewriteQuantizedConvolutionOp << filter_element_type << "\n"); return failure(); } - - if (Operation* filter_op = filter.getDefiningOp(); - filter_op == nullptr || !isa(filter_op)) { - LLVM_DEBUG(llvm::dbgs() << "Filter should be a constant.\n"); - return failure(); - } return success(); } @@ -1155,76 +1254,6 @@ class RewriteQuantizedConvolutionOp return padded_shape; } - // Transposes the filter tensor to match the filter tensor format for - // TFLite convolution. The following transformations are supported: - // - // Depthwise case (`feature_group_count` > 1) - // * Permutates given filter to `[i, 0, 1, o]` format. - // General convolution (`feature_group_count` = 1) - // * Permutates given filter to `[o, 0, 1, i]` format. - // Using TransposeOp doesn't work because the quantized dimension - // changes which violates the constraint for the TransposeOp that the - // input's and output's element type should be the same. 
- DenseIntElementsAttr TransposeFilterValue( - Location loc, PatternRewriter& rewriter, - const DenseIntElementsAttr& filter_value_attr, - const bool is_depthwise) const { - ArrayRef filter_shape = - filter_value_attr.getShapedType().getShape(); - SmallVector filter_constant_values; - for (auto filter_val : filter_value_attr.getValues()) { - filter_constant_values.push_back(filter_val); - } - - SmallVector new_filter_constant_values( - filter_constant_values.size(), 0); - - SmallVector new_filter_shape; - SmallVector transpose_dims; - if (is_depthwise) { - transpose_dims = {2, 0, 1, 3}; - } else { - transpose_dims = {3, 0, 1, 2}; - } - for (int i = 0; i < filter_shape.size(); ++i) { - new_filter_shape.push_back(filter_shape[transpose_dims[i]]); - } - - auto get_array_idx = [](ArrayRef shape, const int i, const int j, - const int k, const int l) -> int64_t { - return (i * shape[1] * shape[2] * shape[3]) + (j * shape[2] * shape[3]) + - (k * shape[3]) + l; - }; - - // Transpose the filter value. - for (int i = 0; i < filter_shape[0]; ++i) { - for (int j = 0; j < filter_shape[1]; ++j) { - for (int k = 0; k < filter_shape[2]; ++k) { - for (int l = 0; l < filter_shape[3]; ++l) { - // [o, 0, 1, i] for `tfl.conv_2d` case`, - // [i, 0, 1, o] for `tfl.depthwise_conv_2d` case. - int old_idx = get_array_idx(filter_shape, i, j, k, l); - int new_idx = is_depthwise - ? get_array_idx(new_filter_shape, k, i, j, l) - : get_array_idx(new_filter_shape, l, i, j, k); - - new_filter_constant_values[new_idx] = - filter_constant_values[old_idx]; - } - } - } - } - - // Create the new filter constant. - auto new_filter_value_attr_type = - RankedTensorType::getChecked(loc, new_filter_shape, - /*elementType=*/rewriter.getI8Type()); - auto new_filter_constant_value_attr = DenseIntElementsAttr::get( - new_filter_value_attr_type, new_filter_constant_values); - - return new_filter_constant_value_attr; - } - std::pair GetDimSize( const ArrayRef shape, const ArrayRef indexes) const { return {shape[indexes[0]], shape[indexes[1]]}; @@ -1335,12 +1364,13 @@ class RewriteQuantizedConvolutionOp // Returns the stride amount for the height and width, respectively. std::pair GetStrides(stablehlo::ConvolutionOp op) const { - DenseI64ArrayAttr window_strides_attr = op.getWindowStridesAttr(); - if (!window_strides_attr) { + std::optional> window_strides_attr = + op.getWindowStrides(); + if (!window_strides_attr.has_value()) { return {1, 1}; // Default values. } - auto window_strides_attr_value = window_strides_attr.asArrayRef(); + auto window_strides_attr_value = window_strides_attr.value(); // It is guaranteed from the spec that it has two values: // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#convolution. return {window_strides_attr_value[0], window_strides_attr_value[1]}; @@ -1349,12 +1379,12 @@ class RewriteQuantizedConvolutionOp // Returns the dilation amount for the height and width, respectively. std::pair GetDilationFactors( stablehlo::ConvolutionOp op) const { - DenseI64ArrayAttr lhs_dilation_attr = op.getLhsDilationAttr(); - if (!lhs_dilation_attr) { + std::optional> lhs_dilation_attr = op.getLhsDilation(); + if (!lhs_dilation_attr.has_value()) { return {1, 1}; // Default values. } - auto lhs_dilation_attr_value = lhs_dilation_attr.asArrayRef(); + auto lhs_dilation_attr_value = lhs_dilation_attr.value(); // It is guaranteed from the spec that it has two values: // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#convolution. 
return {lhs_dilation_attr_value[0], lhs_dilation_attr_value[1]}; @@ -1371,8 +1401,10 @@ class RewriteQuantizedConvolutionOp auto filter_constant_value_attr = cast( cast(filter_value.getDefiningOp()).getValue()); const DenseIntElementsAttr new_filter_value_attr = - TransposeFilterValue(filter_op->getLoc(), rewriter, - filter_constant_value_attr, is_depthwise); + TransposeFilterInConvolution(filter_op->getLoc(), rewriter, + filter_constant_value_attr, is_depthwise); + int64_t kernel_output_feature_dim = + GetConvolutionKernelOutputFeatureDimension(is_depthwise); // Create a new quantized tensor type for the filter. This is required // because the quantized dimension is changed from 3 -> 0. `TFL::Conv2DOp` // requires the quantized dimension to be 0 because it accepts a filter @@ -1383,14 +1415,15 @@ class RewriteQuantizedConvolutionOp auto new_filter_quantized_type = CreateI8F32UniformQuantizedPerAxisType( filter_op->getLoc(), *op.getContext(), filter_uniform_quantized_type.getScales(), - filter_uniform_quantized_type.getZeroPoints(), is_depthwise ? 3 : 0, + filter_uniform_quantized_type.getZeroPoints(), + /*quantization_dimension=*/kernel_output_feature_dim, /*narrow_range=*/true); const auto new_filter_result_type = RankedTensorType::getChecked( filter_op->getLoc(), /*shape=*/new_filter_value_attr.getShapedType().getShape(), /*type=*/new_filter_quantized_type); const int64_t num_output_features = - new_filter_result_type.getShape()[is_depthwise ? 3 : 0]; + new_filter_result_type.getShape()[kernel_output_feature_dim]; new_filter_constant_op = rewriter.create( filter_op->getLoc(), /*output=*/TypeAttr::get(new_filter_result_type), new_filter_value_attr); @@ -1441,15 +1474,6 @@ class RewriteQuantizedConvolutionOp } return bias; } - - bool IsDepthwiseConvolution( - stablehlo::ConvolutionOp op, - const stablehlo::ConvDimensionNumbersAttr dimension_numbers) const { - // `feature_group_count` controls how the input channel dimension is - // split. - // A value bigger than one signals depthwise convolution behavior. - return op.getFeatureGroupCount() > 1; - } }; // Rewrites quantized `stablehlo.transpose` to `tfl.transpose`. @@ -2124,28 +2148,27 @@ class RewriteQuantizedConstantOp } }; -// Splits dot-like hybrid quantized StableHLO ops into `tfl.dequantize` and -// float StableHLO op. Legalization of float StableHLO op depends on existing -// passes for conversion of StableHLO -> MHLO -> TF -> TFL. -template -class RewriteHybridQuantizedDotLikeOp : public OpRewritePattern { +// Splits hybrid quantized `stablehlo.dot_general` into `tfl.dequantize` and +// float `stablehlo.dot_general` op. Legalization of float +// `stablehlo.dot_general` op relies on existing passes for conversion of +// StableHLO -> MHLO -> TF -> TFL. +class RewriteHybridQuantizedDotGeneralOp + : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - LogicalResult match(OpType op) const override { - if (op->getNumOperands() != 2 || op->getNumResults() != 1) { - return failure(); - } + LogicalResult match(stablehlo::DotGeneralOp op) const override { // Lhs and result should not be quantized and rhs should be quantized. 
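// Editorial sketch (not part of the patch): "hybrid" means float activations
// with quantized weights. With abbreviated types, a match such as
//   %r = stablehlo.dot_general %lhs, %w
//        : (tensor<1x4xf32>, tensor<4x8x!quant.uniform<i8:f32, 0.1>>) -> tensor<1x8xf32>
// is rewritten to
//   %wf = "tfl.dequantize"(%w)
//        : (tensor<4x8x!quant.uniform<i8:f32, 0.1>>) -> tensor<4x8xf32>
//   %r = stablehlo.dot_general %lhs, %wf
//        : (tensor<1x4xf32>, tensor<4x8xf32>) -> tensor<1x8xf32>
// leaving a float dot_general for the existing StableHLO -> MHLO -> TF -> TFL
// legalization path.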
return success(!IsQuantizedTensorType(op->getOperand(0).getType()) && IsQuantizedTensorType(op->getOperand(1).getType()) && !IsQuantizedTensorType(op->getResult(0).getType())); } - void rewrite(OpType op, PatternRewriter& rewriter) const override { - Value rhs = op.getOperand(1); + void rewrite(stablehlo::DotGeneralOp op, + PatternRewriter& rewriter) const override { + Value rhs = op.getRhs(); Type lhs_element_type = - op.getOperand(0).getType().template cast().getElementType(); + op.getLhs().getType().template cast().getElementType(); Type dequantized_rhs_type = quant::CloneTypeWithNewElementType(rhs.getType(), lhs_element_type); auto dq = rewriter.create( @@ -2155,17 +2178,135 @@ class RewriteHybridQuantizedDotLikeOp : public OpRewritePattern { } }; +// Splits hybrid quantized `stablehlo.convolution` into `tfl.dequantize` and +// float `stablehlo.convolution` op. Weight tensor is transposed to match the +// filter tensor format for TFLite convolution. +// Legalization of float `stablehlo.convolution` op relies on existing passes +// for conversion of StableHLO -> MHLO -> TF -> TFL. +class RewriteHybridQuantizedConvolutionOp + : public OpRewritePattern { + public: + explicit RewriteHybridQuantizedConvolutionOp(MLIRContext* ctx) + : OpRewritePattern(ctx, /*benefit=*/5) {} + + LogicalResult match(stablehlo::ConvolutionOp op) const override { + if (failed(MatchConvolutionFormat(op))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match dimension format for convolution_op.\n"); + return failure(); + } + // Lhs and result should not be quantized and rhs should be quantized. + return success(!IsQuantizedTensorType(op->getOperand(0).getType()) && + IsQuantizedTensorType(op->getOperand(1).getType()) && + !IsQuantizedTensorType(op->getResult(0).getType())); + } + + void rewrite(stablehlo::ConvolutionOp op, + PatternRewriter& rewriter) const override { + const bool is_depthwise = IsDepthwiseConvolution(op); + + Operation* filter_op = op.getRhs().getDefiningOp(); + auto filter_constant_value_attr = cast( + cast(filter_op).getValue()); + const DenseIntElementsAttr new_filter_value_attr = + TransposeFilterInConvolution(filter_op->getLoc(), rewriter, + filter_constant_value_attr, is_depthwise); + + Type new_filter_type = GetNewWeightQuantizedType( + /*context=*/op.getContext(), /*location=*/filter_op->getLoc(), + /*new_shape=*/new_filter_value_attr.getShapedType().getShape(), + /*filter_type=*/op.getRhs().getType(), is_depthwise); + auto new_filter = rewriter.create( + filter_op->getLoc(), + /*output=*/TypeAttr::get(new_filter_type), new_filter_value_attr); + stablehlo::ConvDimensionNumbersAttr new_dimension_numbers = + GetTflDimensionNumbers(rewriter.getContext(), op.getDimensionNumbers(), + is_depthwise); + op.setDimensionNumbersAttr(new_dimension_numbers); + + Type lhs_element_type = + op.getOperand(0).getType().template cast().getElementType(); + Type dequantized_rhs_type = quant::CloneTypeWithNewElementType( + new_filter.getType(), lhs_element_type); + auto dq = rewriter.create( + op->getLoc(), /*output=*/dequantized_rhs_type, + /*input=*/new_filter); + rewriter.replaceAllUsesExcept(filter_op->getResult(0), dq.getOutput(), dq); + } + + private: + // Returns new quantized type for weights after transpose. 
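// Editorial note (not part of the patch): transposing the filter moves the
// per-axis quantized dimension with it. For the general case the [0, 1, i, o]
// filter becomes [o, 0, 1, i], so quantization_dimension changes from 3 to 0
// (tfl.conv_2d expects per-axis quantization on dim 0); in the depthwise case
// the output-feature axis stays at dim 3. Per-tensor quantized types are
// carried over unchanged.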
+  Type GetNewWeightQuantizedType(MLIRContext* context, Location location,
+                                 ArrayRef<int64_t> new_shape, Type filter_type,
+                                 bool is_depthwise) const {
+    auto tensor_type = filter_type.cast<TensorType>();
+    auto element_type = tensor_type.getElementType();
+    RankedTensorType new_filter_result_type;
+    if (element_type.isa<UniformQuantizedPerAxisType>()) {
+      auto per_axis_type = element_type.cast<UniformQuantizedPerAxisType>();
+      int64_t kernel_output_feature_dim =
+          GetConvolutionKernelOutputFeatureDimension(is_depthwise);
+      auto new_filter_quantized_type = CreateI8F32UniformQuantizedPerAxisType(
+          location, *context, per_axis_type.getScales(),
+          per_axis_type.getZeroPoints(),
+          /*quantization_dimension=*/kernel_output_feature_dim,
+          /*narrow_range=*/true);
+      new_filter_result_type =
+          RankedTensorType::getChecked(location,
+                                       /*shape=*/new_shape,
+                                       /*type=*/new_filter_quantized_type);
+    } else if (element_type.isa<UniformQuantizedType>()) {
+      auto per_tensor_type = element_type.cast<UniformQuantizedType>();
+      new_filter_result_type =
+          RankedTensorType::getChecked(location,
+                                       /*shape=*/new_shape,
+                                       /*type=*/per_tensor_type);
+    } else {
+      LLVM_DEBUG(
+          llvm::dbgs()
+          << "Weight tensor elements do not have uniform quantized type.\n");
+    }
+    return new_filter_result_type;
+  }
+
+  // Returns the dimension numbers of the given stablehlo convolution
+  // attribute, with the filter dimensions transposed to match the TFLite
+  // format.
+  // Depthwise case (`feature_group_count` > 1)
+  //   * `[0, 1, i, o]` -> `[i, 0, 1, o]` format.
+  // General convolution (`feature_group_count` = 1)
+  //   * `[0, 1, i, o]` -> `[o, 0, 1, i]` format.
+  stablehlo::ConvDimensionNumbersAttr GetTflDimensionNumbers(
+      MLIRContext* context,
+      stablehlo::ConvDimensionNumbersAttr dimension_numbers,
+      bool is_depthwise) const {
+    int64_t kernel_input_feature_dim =
+        GetConvolutionKernelInputFeatureDimension(is_depthwise);
+    int64_t kernel_output_feature_dim =
+        GetConvolutionKernelOutputFeatureDimension(is_depthwise);
+    SmallVector<int64_t> kernel_spatial_dims{1, 2};
+
+    return stablehlo::ConvDimensionNumbersAttr::get(
+        context, dimension_numbers.getInputBatchDimension(),
+        dimension_numbers.getInputFeatureDimension(),
+        dimension_numbers.getInputSpatialDimensions(),
+        kernel_input_feature_dim, kernel_output_feature_dim,
+        kernel_spatial_dims, dimension_numbers.getOutputBatchDimension(),
+        dimension_numbers.getOutputFeatureDimension(),
+        dimension_numbers.getOutputSpatialDimensions());
+  }
+};
+
 void UniformQuantizedStableHloToTflPass::runOnOperation() {
   func::FuncOp func_op = getOperation();
   MLIRContext& ctx = getContext();
   RewritePatternSet patterns(&ctx);
-  patterns.add<RewriteHybridQuantizedDotLikeOp<stablehlo::ConvolutionOp>,
-               RewriteHybridQuantizedDotLikeOp<stablehlo::DotGeneralOp>,
-               RewriteUniformDequantizeOp, RewriteUniformQuantizeOp,
-               RewriteQuantizedAddOp, RewriteQuantizedBroadcastInDimOp,
-               RewriteQuantizedConcatenateOp, RewriteQuantizedConstantOp,
-               RewriteQuantizedConvolutionOp,
+  patterns.add
, proj_clip = 0.01 : f32 } : (tensor<1x528xf32>, tensor<2048x528xf32>, tensor<2048x528xf32>, tensor<2048x528xf32>, tensor<2048x528xf32>, tensor<2048x640xf32>, tensor<2048x640xf32>, tensor<2048x640xf32>, tensor<2048x640xf32>, none, none, none, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<640x2048xf32>, tensor<640xf32>, tensor<1x640xf32>, tensor<1x2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>) -> tensor<1x640xf32>
   func.return %0 : tensor<1x640xf32>

-// CHECK: %[[NONE:.+]] = "tfl.no_value"() {value} : () -> none
+// CHECK: %[[NONE:.+]] = "tfl.no_value"() <{value}> : () -> none
 // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %[[NONE]],
%[[NONE]], %[[NONE]], %arg9, %arg10, %arg11, %arg12, %arg13, %[[NONE]], %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) } @@ -282,11 +282,11 @@ func.func @keepCustomFlexOps(%arg0: tensor<1x10xf32>) -> tensor<1x10xf32> { %2 = "tfl.custom"(%1, %arg0) {custom_code = "FlexAddV2", custom_option = #tfl} : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> "tfl.custom"(%0, %2) {custom_code = "FlexAssignVariableOp", custom_option = #tfl} : (tensor>>, tensor<1x10xf32>) -> () %3 = "tfl.custom"(%0) {custom_code = "FlexReadVariableOp", custom_option = #tfl} : (tensor>>) -> tensor<1x10xf32> - // CHECK: %0 = "tfl.custom"() {custom_code = "FlexVarHandleOp" - // CHECK-NEXT: %1 = "tfl.custom"(%0) {custom_code = "FlexReadVariableOp" - // CHECK-NEXT: %2 = "tfl.custom"(%1, %arg0) {custom_code = "FlexAddV2" - // CHECK-NEXT: "tfl.custom"(%0, %2) {custom_code = "FlexAssignVariableOp" - // CHECK-NEXT: %3 = "tfl.custom"(%0) {custom_code = "FlexReadVariableOp" + // CHECK: %0 = "tfl.custom"() <{custom_code = "FlexVarHandleOp" + // CHECK-NEXT: %1 = "tfl.custom"(%0) <{custom_code = "FlexReadVariableOp" + // CHECK-NEXT: %2 = "tfl.custom"(%1, %arg0) <{custom_code = "FlexAddV2" + // CHECK-NEXT: "tfl.custom"(%0, %2) <{custom_code = "FlexAssignVariableOp" + // CHECK-NEXT: %3 = "tfl.custom"(%0) <{custom_code = "FlexReadVariableOp" func.return %3 : tensor<1x10xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index f244d15294c253..9626a292b8eb6d 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -416,7 +416,7 @@ func.func @reshape_dynamic_output() -> tensor { %input = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> %shape = arith.constant dense<[4]> : tensor<1xi32> - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[1, 2, 3, 4]> : tensor<4xi32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<[1, 2, 3, 4]> : tensor<4xi32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tfl.reshape"(%input, %shape) : (tensor<2x2xi32>, tensor<1xi32>) -> tensor func.return %0 : tensor @@ -438,7 +438,7 @@ func.func @range_int() -> tensor { %cst_1 = arith.constant dense<4> : tensor %cst_2 = arith.constant dense<1> : tensor - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0, 1, 2, 3]> : tensor<4xi32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<[0, 1, 2, 3]> : tensor<4xi32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -450,7 +450,7 @@ func.func @range_float() -> tensor { %cst_1 = arith.constant dense<4.0> : tensor %cst_2 = arith.constant dense<1.0> : tensor - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00]> : tensor<4xf32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -463,7 +463,7 @@ func.func @range_float_neg_delta() -> tensor { %cst_1 = arith.constant dense<-4.0> : tensor %cst_2 = arith.constant dense<-1.0> : tensor - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[0.000000e+00, -1.000000e+00, -2.000000e+00, -3.000000e+00]> : tensor<4xf32>} : () -> tensor + // 
CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<[0.000000e+00, -1.000000e+00, -2.000000e+00, -3.000000e+00]> : tensor<4xf32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -475,7 +475,7 @@ func.func @range_float_nonzero_base() -> tensor { %cst_1 = arith.constant dense<7.0> : tensor %cst_2 = arith.constant dense<1.5> : tensor - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[2.000000e+00, 3.500000e+00, 5.000000e+00, 6.500000e+00]> : tensor<4xf32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<[2.000000e+00, 3.500000e+00, 5.000000e+00, 6.500000e+00]> : tensor<4xf32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tfl.range"(%cst, %cst_1, %cst_2) : (tensor, tensor, tensor) -> tensor func.return %0 : tensor @@ -508,7 +508,7 @@ func.func @transpose_dynamic() -> tensor { %cst = arith.constant dense<[1, 2, 3]> : tensor<3xi32> %cst_perm = arith.constant dense<0> : tensor<1xi32> - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<{{\[}}1, 2, 3]> : tensor<3xi32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<{{\[}}1, 2, 3]> : tensor<3xi32>}> : () -> tensor // CHECK: return %[[CST]] %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<3xi32>, tensor<1xi32>) -> tensor func.return %0 : tensor @@ -567,7 +567,7 @@ func.func @ConstantFoldBinaryOpDynamicOutput() -> tensor { %87 = "tfl.sub"(%cst_0, %cst) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor func.return %87 : tensor - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[-5, 0]> : tensor<2xi32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<[-5, 0]> : tensor<2xi32>}> : () -> tensor // CHECK: return %[[CST]] } @@ -580,7 +580,7 @@ func.func @add_dense_dense_int_same_shape_dynamic() -> tensor { func.return %2 : tensor - // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() {value = dense<[5, 22, -2, 98]> : tensor<4xi32>} : () -> tensor + // CHECK: %[[CST:.*]] = "tfl.pseudo_const"() <{value = dense<[5, 22, -2, 98]> : tensor<4xi32>}> : () -> tensor // CHECK: return %[[CST]] } @@ -603,7 +603,7 @@ func.func @concat_3_tensors_1_empty() -> tensor { %3 = "tfl.concatenation"(%0, %1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<2xi32>, tensor<2xi32>, tensor<0xi32>) -> tensor func.return %3 : tensor - // CHECK: %0 = "tfl.concatenation"(%[[CST]], %[[CST]]) {axis = 0 : i32, fused_activation_function = "NONE"} + // CHECK: %0 = "tfl.concatenation"(%[[CST]], %[[CST]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> // CHECK: return %0 : tensor } @@ -835,7 +835,7 @@ func.func @NoFoldFullyConnectedNonFloat() -> tensor<1024xf32> { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : tensor<512xf32> // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<2> : tensor<1024x512xi8> // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<4.000000e+00> : tensor<1024xf32> - // CHECK: %[[VAL:.*]] = "tfl.fully_connected"(%[[CST]], %[[CST_0]], %[[CST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<512xf32>, tensor<1024x512xi8>, tensor<1024xf32>) -> tensor<1024xf32> + // CHECK: %[[VAL:.*]] = "tfl.fully_connected"(%[[CST]], %[[CST_0]], %[[CST_1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<512xf32>, tensor<1024x512xi8>, tensor<1024xf32>) -> tensor<1024xf32> // CHECK: return %[[VAL]] : tensor<1024xf32> } 
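Every CHECK-line update in these test diffs tracks the same MLIR printer change: inherent op attributes now print as a properties dictionary, `<{...}>`, placed between the operand list and the type signature, where the old generic syntax folded them into the plain attribute dictionary `{...}`. A minimal before/after sketch (the op and types are illustrative, not drawn from this diff):

 // Old generic printer: inherent attributes in the attribute dictionary.
 %0 = "tfl.add"(%a, %b) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
 // New generic printer: the same inherent attribute as a property.
 %0 = "tfl.add"(%a, %b) <{fused_activation_function = "NONE"}> : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>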
@@ -851,7 +851,7 @@ func.func @NoFoldFullyConnectedHighRank() -> tensor<2x1024xf32> { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : tensor<2x512xf32> // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<2.000000e+00> : tensor<1024x512xf32> // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<4.000000e+00> : tensor<1024xf32> - // CHECK: %[[VAL:.*]] = "tfl.fully_connected"(%[[CST]], %[[CST_0]], %[[CST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> + // CHECK: %[[VAL:.*]] = "tfl.fully_connected"(%[[CST]], %[[CST_0]], %[[CST_1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> // CHECK: return %[[VAL]] : tensor<2x1024xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/decompose-hybrid-quantization.mlir b/tensorflow/compiler/mlir/lite/tests/decompose-hybrid-quantization.mlir index ce2c896ccde2ab..b656f8c649b5c4 100644 --- a/tensorflow/compiler/mlir/lite/tests/decompose-hybrid-quantization.mlir +++ b/tensorflow/compiler/mlir/lite/tests/decompose-hybrid-quantization.mlir @@ -2,9 +2,9 @@ // CHECK-LABEL: @test_conv2d_float func.func @test_conv2d_float(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x16xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_const"() {value = dense<42> : tensor<16x1x1x8xi8>} - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_const"() {value = dense<1> : tensor<16x1x1x8xi8>} - // CHECK-DAG: %[[VAL2:.+]] = "tfl.conv_2d"(%arg0, %[[VAL0]], %[[VAL1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_const"() <{value = dense<42> : tensor<16x1x1x8xi8>}> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_const"() <{value = dense<1> : tensor<16x1x1x8xi8>}> + // CHECK-DAG: %[[VAL2:.+]] = "tfl.conv_2d"(%arg0, %[[VAL0]], %[[VAL1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> // CHECK: return %[[VAL2]] %0 = "tfl.pseudo_const"() {value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8xf32> %1 = "tfl.pseudo_const"() {value = dense<1> : tensor<16x1x1x8xi8>} : () -> tensor<16xf32> @@ -16,9 +16,9 @@ func.func @test_conv2d_float(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x16x // CHECK-LABEL: @test_conv2d_qi8 func.func @test_conv2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<1x32x32x16x!quant.uniform> { - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>} - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<{{.+}}>, value = dense<0> : tensor<16xi32>} - // CHECK-DAG: %[[VAL2:.+]] = "tfl.conv_2d"(%arg0, %0, %1) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>}> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<{{.+}}>, value = dense<0> : tensor<16xi32>}> + // CHECK-DAG: %[[VAL2:.+]] = "tfl.conv_2d"(%arg0, %0, %1) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, 
stride_w = 1 : i32}> // CHECK: return %[[VAL2]] %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform> %1 = "tfl.pseudo_qconst"() {qtype = tensor<16x!quant.uniform>, value = dense<0> : tensor<16xi32>} : () -> tensor<16x!quant.uniform> @@ -31,8 +31,8 @@ func.func @test_conv2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) // CHECK-LABEL: @test_conv2d_qi16 func.func @test_conv2d_qi16(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<1x32x32x16x!quant.uniform> { // CHECK-DAG: %[[BIAS:.+]] = arith.constant dense<0> : tensor<16xi64> - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>} - // CHECK-DAG: %[[VAL1:.+]] = "tfl.conv_2d"(%arg0, %[[VAL0]], %[[BIAS]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>}> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.conv_2d"(%arg0, %[[VAL0]], %[[BIAS]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> // CHECK: return %[[VAL1]] %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform> %1 = "arith.constant"() {value = dense<0> : tensor<16xi64>} : () -> tensor<16xi64> @@ -44,12 +44,12 @@ func.func @test_conv2d_qi16(%arg0: tensor<1x32x32x8x!quant.uniform // CHECK-LABEL: @test_conv2d_replace_qi8 func.func @test_conv2d_replace_qi8(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x16x!quant.uniform> { - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>} - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<{{.+}}>, value = dense<0> : tensor<16xi32>} + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>}> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<{{.+}}>, value = dense<0> : tensor<16xi32>}> // CHECK-DAG: %[[VAL2:.+]] = "tfl.dequantize"(%[[VAL0]]) // CHECK-DAG: %[[VAL3:.+]] = "tfl.dequantize"(%[[VAL1]]) - // CHECK-DAG: %[[VAL4:.+]] = "tfl.conv_2d"(%arg0, %[[VAL2]], %[[VAL3]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} - // CHECK-DAG: %[[VAL5:.+]] = "tfl.quantize"(%4) {qtype = tensor<1x32x32x16x!quant.uniform>} + // CHECK-DAG: %[[VAL4:.+]] = "tfl.conv_2d"(%arg0, %[[VAL2]], %[[VAL3]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> + // CHECK-DAG: %[[VAL5:.+]] = "tfl.quantize"(%4) <{qtype = tensor<1x32x32x16x!quant.uniform>}> // CHECK: return %[[VAL5]] %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform> %1 = "tfl.pseudo_qconst"() {qtype = tensor<16x!quant.uniform>, value = dense<0> : tensor<16xi32>} : () -> tensor<16x!quant.uniform> @@ -61,11 +61,11 @@ func.func @test_conv2d_replace_qi8(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x // CHECK-LABEL: @test_conv2d_replace_float func.func @test_conv2d_replace_float(%arg0: 
tensor<1x32x32x8xf32>) -> tensor<1x32x32x16xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform<{{.+}}>> - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<16x{{.+}}>, value = dense<0> : tensor<16xi32>} : () -> tensor<16x!quant.uniform<{{.+}}>> + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<16x1x1x8x{{.+}}>, value = dense<42> : tensor<16x1x1x8xi8>}> : () -> tensor<16x1x1x8x!quant.uniform<{{.+}}>> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<16x{{.+}}>, value = dense<0> : tensor<16xi32>}> : () -> tensor<16x!quant.uniform<{{.+}}>> // CHECK-DAG: %[[VAL2:.+]] = "tfl.dequantize"(%[[VAL0]]) // CHECK-DAG: %[[VAL3:.+]] = "tfl.dequantize"(%[[VAL1]]) - // CHECK-DAG: %[[VAL4:.+]] = "tfl.conv_2d"(%arg0, %[[VAL2]], %[[VAL3]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} + // CHECK-DAG: %[[VAL4:.+]] = "tfl.conv_2d"(%arg0, %[[VAL2]], %[[VAL3]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> // CHECK: return %[[VAL4]] %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform>, value = dense<42> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform> %1 = "tfl.pseudo_qconst"() {qtype = tensor<16x!quant.uniform>, value = dense<0> : tensor<16xi32>} : () -> tensor<16x!quant.uniform> @@ -77,10 +77,10 @@ func.func @test_conv2d_replace_float(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x3 // CHECK-LABEL: @test_conv3d_float func.func @test_conv3d_float(%arg0: tensor<1x32x32x32x8xf32>) -> tensor<1x32x32x32x16xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<16xf32> - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x1x8x16x!quant.uniform<{{.+}}>>, value = dense<42> : tensor<1x1x1x8x16xi8>} + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<16xf32> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x1x8x16x!quant.uniform<{{.+}}>>, value = dense<42> : tensor<1x1x1x8x16xi8>}> // CHECK: %[[VAL2:.+]] = "tfl.dequantize"(%[[VAL1]]) : (tensor<1x1x1x8x16x!quant.uniform<{{.+}}>>) -> tensor<1x1x1x8x16xf32> - // CHECK: %[[VAL3:.+]] = "tfl.conv_3d"(%arg0, %[[VAL2]], %[[VAL0]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} + // CHECK: %[[VAL3:.+]] = "tfl.conv_3d"(%arg0, %[[VAL2]], %[[VAL0]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> // CHECK: return %[[VAL3]] %0 = "tfl.pseudo_qconst"() {qtype = tensor<1x1x8x16x!quant.uniform>, value = dense<42> : tensor<1x1x1x8x16xi8>} : () -> tensor<1x1x1x8x16x!quant.uniform> %1 = "tfl.pseudo_const"() { value = dense<1.0> : tensor<16xf32>} : () -> tensor<16xf32> @@ -92,12 +92,12 @@ func.func @test_conv3d_float(%arg0: tensor<1x32x32x32x8xf32>) -> tensor<1x32x32x // CHECK-LABEL: @test_transpose_conv2d func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x16xf32> { - // CHECK-DAG: %[[SHAPE:.+]] = "tfl.pseudo_const"() {value = 
dense<[1, 32, 32, 16]> - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<16x{{.+}}>, value = dense<1> : tensor<16xi32>} - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<16x{{.+}}>, value = dense<2> : tensor<16xi32>} + // CHECK-DAG: %[[SHAPE:.+]] = "tfl.pseudo_const"() <{value = dense<[1, 32, 32, 16]> + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<16x{{.+}}>, value = dense<1> : tensor<16xi32>}> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<16x{{.+}}>, value = dense<2> : tensor<16xi32>}> // CHECK-DAG: %[[VAL2:.+]] = "tfl.dequantize"(%[[VAL0]]) // CHECK-DAG: %[[VAL3:.+]] = "tfl.dequantize"(%[[VAL1]]) - // CHECK-DAG: %[[VAL4:.+]] = "tfl.transpose_conv"(%[[SHAPE]], %[[VAL2]], %arg0, %[[VAL3]]) {fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} + // CHECK-DAG: %[[VAL4:.+]] = "tfl.transpose_conv"(%[[SHAPE]], %[[VAL2]], %arg0, %[[VAL3]]) <{fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> // CHECK: return %[[VAL4]] %0 = "tfl.pseudo_const"() { value = dense<[1, 32, 32, 16]> : tensor<4xi32> } : () -> tensor<4xi32> %1 = "tfl.pseudo_qconst"() {qtype = tensor<16x!quant.uniform>, value = dense<1> : tensor<16xi32>} : () -> tensor<16x1x1x8x!quant.uniform> @@ -110,11 +110,11 @@ func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32 // CHECK-LABEL: @test_depthwise_conv2d_replace_float func.func @test_depthwise_conv2d_replace_float(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform<{{.+}}>>, value = dense<42> : tensor<32x3x3x3xi8>} - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform<{{.+}}>>, value = dense<0> : tensor<32xi32>} + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform<{{.+}}>>, value = dense<42> : tensor<32x3x3x3xi8>}> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform<{{.+}}>>, value = dense<0> : tensor<32xi32>}> // CHECK-DAG: %[[VAL2:.+]] = "tfl.dequantize"(%[[VAL0]]) : (tensor<32x3x3x3x!quant.uniform<{{.+}}>>) // CHECK-DAG: %[[VAL3:.+]] = "tfl.dequantize"(%[[VAL1]]) : (tensor<32x!quant.uniform<{{.+}}) - // CHECK-DAG: %[[VAL4:.+]] = "tfl.depthwise_conv_2d"(%arg0, %[[VAL2]], %[[VAL3]]) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} + // CHECK-DAG: %[[VAL4:.+]] = "tfl.depthwise_conv_2d"(%arg0, %[[VAL2]], %[[VAL3]]) <{depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> // CHECK: return %[[VAL4]] %0 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform>, value = dense<42> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform> %1 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> @@ -126,11 +126,11 @@ func.func @test_depthwise_conv2d_replace_float(%arg0: tensor<1x224x224x3xf32>) - // CHECK-LABEL: @test_fullyconnected_replace_float func.func @test_fullyconnected_replace_float(%arg0: tensor<4x256x6x6xf32>) -> tensor<4x256x36xf32> { - // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<36x36x!quant.uniform<{{.+}}>>, value = 
dense<42> : tensor<36x36xi8>} - // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<36x!quant.uniform<{{.+}}>>, value = dense<0> : tensor<36xi32>} + // CHECK-DAG: %[[VAL0:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<36x36x!quant.uniform<{{.+}}>>, value = dense<42> : tensor<36x36xi8>}> + // CHECK-DAG: %[[VAL1:.+]] = "tfl.pseudo_qconst"() <{qtype = tensor<36x!quant.uniform<{{.+}}>>, value = dense<0> : tensor<36xi32>}> // CHECK-DAG: %[[VAL2:.+]] = "tfl.dequantize"(%[[VAL0]]) : (tensor<36x36x!quant.uniform>) // CHECK-DAG: %[[VAL3:.+]] = "tfl.dequantize"(%[[VAL1]]) : (tensor<36x!quant.uniform>) - // CHECK: %[[VAL4:.+]] = "tfl.fully_connected"(%arg0, %[[VAL2]], %[[VAL3]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[VAL4:.+]] = "tfl.fully_connected"(%arg0, %[[VAL2]], %[[VAL3]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: return %[[VAL4]] %0 = "tfl.pseudo_qconst"() {qtype = tensor<36x36x!quant.uniform>, value = dense<42> : tensor<36x36xi8>} : () -> tensor<36x36x!quant.uniform> %1 = "tfl.pseudo_qconst"() {qtype = tensor<36x!quant.uniform>, value = dense<0> : tensor<36xi32>} : () -> tensor<36x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/tests/default_quant_params.mlir b/tensorflow/compiler/mlir/lite/tests/default_quant_params.mlir index b062de1e84aede..2bb3020618d5c7 100644 --- a/tensorflow/compiler/mlir/lite/tests/default_quant_params.mlir +++ b/tensorflow/compiler/mlir/lite/tests/default_quant_params.mlir @@ -5,10 +5,10 @@ func.func @hardcode_all(%arg0: tensor<2x2xf32>, %arg1: tensor<2x1xf32>) -> tenso %0 = "tfl.add"(%arg0, %arg1) {fused_activation_function="NONE"}: (tensor<2x2xf32>, tensor<2x1xf32>) -> tensor<2x2xf32> func.return %0 : tensor<2x2xf32> -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<2x1x!quant.uniform>} -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<2x2x!quant.uniform>} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<2x1x!quant.uniform>}> +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<2x2x!quant.uniform>}> // Quantized tfl.add -// CHECK: %[[add:.*]] = tfl.add(%[[q1]], %[[q0]]) {fused_activation_function = "NONE"} : (tensor<2x2x!quant.uniform> +// CHECK: %[[add:.*]] = tfl.add(%[[q1]], %[[q0]]) <{fused_activation_function = "NONE"}> : (tensor<2x2x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[add]]) : (tensor<2x2x!quant.uniform>) // CHECK: return %[[dq]] } @@ -20,9 +20,9 @@ func.func @hardcode_input(%arg0: tensor<2x2xf32>, %arg1: tensor<2x1xf32>) -> ten %4 = "tfl.add"(%1, %arg1) {fused_activation_function="NONE"}: (tensor<2x2xf32>, tensor<2x1xf32>) -> tensor<2x2xf32> func.return %4 : tensor<2x2xf32> -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<2x1x!quant.uniform>} -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<2x2x!quant.uniform>} -// CHECK: %[[add:.*]] = tfl.add(%[[q1]], %[[q0]]) {fused_activation_function = "NONE"} : (tensor<2x2x!quant.uniform> +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<2x1x!quant.uniform>}> +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<2x2x!quant.uniform>}> +// CHECK: %[[add:.*]] = tfl.add(%[[q1]], %[[q0]]) <{fused_activation_function = "NONE"}> : (tensor<2x2x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[add]]) : (tensor<2x2x!quant.uniform>) // CHECK: return %[[dq]] } @@ -33,8 +33,8 @@ func.func @hardcode_input_deq(%arg0: tensor<2x2x!quant.uniform>, %a %4 = "tfl.add"(%1, 
%arg1) {fused_activation_function="NONE"}: (tensor<2x2xf32>, tensor<2x1xf32>) -> tensor<2x2xf32> func.return %4 : tensor<2x2xf32> -// CHECK: %[[q:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<2x1x!quant.uniform>} -// CHECK: %[[add:.*]] = tfl.add(%arg0, %[[q]]) {fused_activation_function = "NONE"} : (tensor<2x2x!quant.uniform> +// CHECK: %[[q:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<2x1x!quant.uniform>}> +// CHECK: %[[add:.*]] = tfl.add(%arg0, %[[q]]) <{fused_activation_function = "NONE"}> : (tensor<2x2x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[add]]) : (tensor<2x2x!quant.uniform>) // CHECK: return %[[dq]] } @@ -48,9 +48,9 @@ func.func @hardcode_output(%arg0: tensor<2x2xf32>, %arg1: tensor<2x1xf32>) -> te %4 = "tfl.add"(%2, %3) {fused_activation_function="NONE"}: (tensor<2x2xf32>, tensor<2x1xf32>) -> tensor<2x2xf32> func.return %4 : tensor<2x2xf32> -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<2x2x!quant.uniform>} -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<2x1x!quant.uniform>} -// CHECK: %[[add:.*]] = tfl.add(%[[q0]], %[[q1]]) {fused_activation_function = "NONE"} : (tensor<2x2x!quant.uniform> +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<2x2x!quant.uniform>}> +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<2x1x!quant.uniform>}> +// CHECK: %[[add:.*]] = tfl.add(%[[q0]], %[[q1]]) <{fused_activation_function = "NONE"}> : (tensor<2x2x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[add]]) : (tensor<2x2x!quant.uniform>) // CHECK: return %[[dq]] } @@ -81,8 +81,8 @@ func.func @test_conv_2d_activation_and_bias(%arg0: tensor<1x224x224x3xf32>, %arg %1 = "tfl.conv_2d"(%arg0, %0, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> func.return %1 : tensor<1x112x112x32xf32> -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg2) {qtype = tensor<32x!quant.uniform>} -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x224x224x3x!quant.uniform>} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg2) <{qtype = tensor<32x!quant.uniform>}> +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x224x224x3x!quant.uniform>}> // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%[[q1]], %arg1, %[[q0]]) // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[conv]]) : (tensor<1x112x112x32x!quant.uniform>) // CHECK: return %[[dq]] diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt index ada1c80dfd7535..3574ae83a41998 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt @@ -450,9 +450,9 @@ node { # MLIR-SAME: inputs = "input" # MLIR-SAME: outputs = "output" # MLIR: %[[shape:.*]] = arith.constant dense<[1, -1, 31]> : tensor<3xi32> -# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<186x!quant.uniform:f32:0, {0.12581039038230116, -# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], %[[bias]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} +# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<186x!quant.uniform:f32:0, {0.12581039038230116, +# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], 
%[[bias]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} # MLIR: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x1x1x186x!quant.uniform>, tensor<3xi32>) # MLIR: return %[[reshape]] : tensor<1x6x31x!quant.uniform> # MLIR: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel_4bit.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel_4bit.pbtxt index 60027cb443a091..7040bd424d02a8 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel_4bit.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel_4bit.pbtxt @@ -450,9 +450,9 @@ node { # MLIR-SAME: inputs = "input" # MLIR-SAME: outputs = "output" # MLIR: %[[shape:.*]] = arith.constant dense<[1, -1, 31]> : tensor<3xi32> -# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<186x!quant.uniform:f32:0, {2.2825599397931779, -# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], %[[bias]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} +# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<186x!quant.uniform:f32:0, {2.2825599397931779, +# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], %[[bias]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} # MLIR: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x1x1x186x!quant.uniform>, tensor<3xi32>) # MLIR: return %[[reshape]] : tensor<1x6x31x!quant.uniform> # MLIR: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity.pbtxt index 6bacdbda2f933b..07c167fdfe01fc 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity.pbtxt @@ -410,9 +410,9 @@ node { # MLIR-SAME: inputs = "input" # MLIR-SAME: outputs = "output" # MLIR: %[[shape:.*]] = arith.constant dense<[1, -1, 31]> : tensor<3xi32> -# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<186x!quant.uniform:f32:0, {0.12581039038230116, -# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], %[[bias]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} +# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<186x!quant.uniform:f32:0, {0.12581039038230116, +# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], %[[bias]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} # MLIR: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x1x1x186x!quant.uniform>, tensor<3xi32>) # MLIR: return %[[reshape]] : tensor<1x6x31x!quant.uniform> # MLIR: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity_4bit.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity_4bit.pbtxt index 12e02dbd014d1e..5e7f04d6beaaa4 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity_4bit.pbtxt +++ 
b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_without_identity_4bit.pbtxt @@ -410,9 +410,9 @@ node { # MLIR-SAME: inputs = "input" # MLIR-SAME: outputs = "output" # MLIR: %[[shape:.*]] = arith.constant dense<[1, -1, 31]> : tensor<3xi32> -# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<186x!quant.uniform:f32:0, {2.2825599397931779, -# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], %[[bias]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} +# MLIR: %[[bias:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<186x!quant.uniform:f32:0, {2.2825599397931779, +# MLIR: %[[conv:.*]] = "tfl.conv_2d"(%[[ARG_0]], %[[weight]], %[[bias]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} # MLIR: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x1x1x186x!quant.uniform>, tensor<3xi32>) # MLIR: return %[[reshape]] : tensor<1x6x31x!quant.uniform> # MLIR: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/quant_stats.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/quant_stats.pbtxt index ebab9a55611287..a2f21223929d79 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/quant_stats.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/quant_stats.pbtxt @@ -58,7 +58,7 @@ versions { # MLIR-LABEL: func @main(%arg0: tensor<4x!quant.uniform>, %arg1: tensor<4x!quant.uniform>) -> tensor<4x!quant.uniform> # MLIR-SAME: attributes {tf.entry_function = {control_outputs = "", inputs = "input0,input1", outputs = "Add"}} { -# MLIR-NEXT: %[[add:.*]] = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<4x!quant.uniform>, tensor<4x!quant.uniform>) -> tensor<4x!quant.uniform> +# MLIR-NEXT: %[[add:.*]] = tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<4x!quant.uniform>, tensor<4x!quant.uniform>) -> tensor<4x!quant.uniform> # MLIR-NEXT: return %[[add]] : tensor<4x!quant.uniform> # MLIR-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul.pbtxt index adbc83bde4b8fe..293fe283ee2685 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul.pbtxt @@ -79,12 +79,12 @@ versions { # CHECK: func @main(%[[VAL_0:.*]]: tensor<2x5x3xf32>, %[[VAL_1:.*]]: tensor<3x7xf32>) -> tensor<2x5x7xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "Placeholder,Placeholder_1", outputs = "MatMul"}} { # CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> -# CHECK-DAG: %[[VAL_3:.*]] = "tfl.no_value"() {value} : () -> none +# CHECK-DAG: %[[VAL_3:.*]] = "tfl.no_value"() <{value}> : () -> none # CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<0> : tensor -# CHECK: %[[VAL_7:.*]]:2 = "tfl.split"(%[[VAL_6]], %[[VAL_0]]) {num_splits = 2 : i32} : (tensor, tensor<2x5x3xf32>) -> (tensor<1x5x3xf32>, tensor<1x5x3xf32>) +# CHECK: %[[VAL_7:.*]]:2 = "tfl.split"(%[[VAL_6]], %[[VAL_0]]) <{num_splits = 2 : i32}> : (tensor, tensor<2x5x3xf32>) -> (tensor<1x5x3xf32>, tensor<1x5x3xf32>) # CHECK: %[[VAL_9:.*]] = "tfl.transpose"(%[[VAL_1]], %[[VAL_2]]) : (tensor<3x7xf32>, tensor<2xi32>) -> tensor<7x3xf32> -# CHECK: %[[VAL_10:.*]] = "tfl.fully_connected"(%[[VAL_7]]#0, %[[VAL_9]], %[[VAL_3]]) {fused_activation_function = "NONE", 
keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x5x3xf32>, tensor<7x3xf32>, none) -> tensor<5x7xf32> -# CHECK: %[[VAL_11:.*]] = "tfl.fully_connected"(%[[VAL_7]]#1, %[[VAL_9]], %[[VAL_3]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x5x3xf32>, tensor<7x3xf32>, none) -> tensor<5x7xf32> -# CHECK: %[[VAL_12:.*]] = "tfl.pack"(%[[VAL_10]], %[[VAL_11]]) {axis = 0 : i32, values_count = 2 : i32} : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<2x5x7xf32> +# CHECK: %[[VAL_10:.*]] = "tfl.fully_connected"(%[[VAL_7]]#0, %[[VAL_9]], %[[VAL_3]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x5x3xf32>, tensor<7x3xf32>, none) -> tensor<5x7xf32> +# CHECK: %[[VAL_11:.*]] = "tfl.fully_connected"(%[[VAL_7]]#1, %[[VAL_9]], %[[VAL_3]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x5x3xf32>, tensor<7x3xf32>, none) -> tensor<5x7xf32> +# CHECK: %[[VAL_12:.*]] = "tfl.pack"(%[[VAL_10]], %[[VAL_11]]) <{axis = 0 : i32, values_count = 2 : i32}> : (tensor<5x7xf32>, tensor<5x7xf32>) -> tensor<2x5x7xf32> # CHECK: return %[[VAL_12]] : tensor<2x5x7xf32> # CHECK: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul_disabled.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul_disabled.pbtxt index c9287bc6184fb3..b75f3076054978 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul_disabled.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/unroll_batch_matmul_disabled.pbtxt @@ -78,6 +78,6 @@ versions { } # CHECK: func @main(%[[VAL_0:.*]]: tensor<2x5x3xf32>, %[[VAL_1:.*]]: tensor<3x7xf32>) -> tensor<2x5x7xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "Placeholder,Placeholder_1", outputs = "MatMul"}} { -# CHECK: %[[VAL_2:.*]] = "tfl.batch_matmul"(%[[VAL_0]], %[[VAL_1]]) {adj_x = false, adj_y = false} : (tensor<2x5x3xf32>, tensor<3x7xf32>) -> tensor<2x5x7xf32> +# CHECK: %[[VAL_2:.*]] = "tfl.batch_matmul"(%[[VAL_0]], %[[VAL_1]]) <{adj_x = false, adj_y = false}> : (tensor<2x5x3xf32>, tensor<3x7xf32>) -> tensor<2x5x7xf32> # CHECK: return %[[VAL_2]] : tensor<2x5x7xf32> # CHECK: } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD index 2afbe2a0d2c766..5a5b9e32e8b9dd 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD @@ -31,6 +31,7 @@ filegroup( "//tensorflow/compiler/mlir/lite:flatbuffer_to_string", "//tensorflow/compiler/mlir/lite:flatbuffer_translate", "//tensorflow/compiler/mlir/lite:json_to_flatbuffer", + "//tensorflow/compiler/mlir/lite:tf_tfl_translate", "@llvm-project//llvm:FileCheck", ], ) @@ -55,8 +56,8 @@ tf_native_cc_binary( "importer_test_min_max.cc", ], deps = [ + "//tensorflow/compiler/mlir/lite/schema:schema_fbs", "//tensorflow/lite:framework", - "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/schema:schema_utils", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/basic_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/basic_lstm.mlir index af5f320a24f6e2..5f0845a391e783 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/basic_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/basic_lstm.mlir @@ -3,7 +3,7 @@ func.func @main(%arg0: tensor<1x384xf32>, %arg1: 
tensor<1x96xf32>, %arg2: tensor<384x480xf32>, %arg3: tensor<384xf32>, %arg4: tensor<1x96xf32>) -> tensor<1x96xf32> { // CHECK-LABEL: @main -// CHECK: "tfl.basic_lstm"({{.*}}) {asymmetric_quantize_inputs = false, cell_clip = 1.000000e+00 : f32, fused_activation_function = "RELU", kernel_type = #tfl, proj_clip = 2.000000e+00 : f32} : (tensor<1x384xf32>, tensor<1x96xf32>, tensor<384x480xf32>, tensor<384xf32>, tensor<1x96xf32>) -> (tensor<1x96xf32>, tensor<1x96xf32>, tensor<1x480xf32>, tensor<1x384xf32>) +// CHECK: "tfl.basic_lstm"({{.*}}) <{cell_clip = 1.000000e+00 : f32, fused_activation_function = "RELU", kernel_type = #tfl, proj_clip = 2.000000e+00 : f32}> {asymmetric_quantize_inputs = false} : (tensor<1x384xf32>, tensor<1x96xf32>, tensor<384x480xf32>, tensor<384xf32>, tensor<1x96xf32>) -> (tensor<1x96xf32>, tensor<1x96xf32>, tensor<1x480xf32>, tensor<1x384xf32>) %0:4 = "tfl.basic_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "RELU", cell_clip = 1.0 : f32, proj_clip = 2.0 : f32} : (tensor<1x384xf32>, tensor<1x96xf32>, tensor<384x480xf32>, tensor<384xf32>, tensor<1x96xf32>) -> (tensor<1x96xf32>, tensor<1x96xf32>, tensor<1x480xf32>, tensor<1x384xf32>) func.return %0#0 : tensor<1x96xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/bucketize.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/bucketize.mlir index ba1bdcf9f28807..9b4463cda1a587 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/bucketize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/bucketize.mlir @@ -3,7 +3,7 @@ func.func @main(%arg0: tensor<3x2xf32>) -> tensor<3x2xi32> { // CHECK-LABEL: @main - // CHECK: "tfl.bucketize"(%arg0) {boundaries = [0.000000e+00 : f32, 1.000000e+01 : f32, 1.000000e+02 : f32]} : (tensor<3x2xf32>) -> tensor<3x2xi32> + // CHECK: "tfl.bucketize"(%arg0) <{boundaries = [0.000000e+00 : f32, 1.000000e+01 : f32, 1.000000e+02 : f32]}> : (tensor<3x2xf32>) -> tensor<3x2xi32> %0 = "tfl.bucketize"(%arg0) {boundaries = [0.0 : f32, 10.0 : f32, 100.0 : f32]} : (tensor<3x2xf32>) -> tensor<3x2xi32> func.return %0 : tensor<3x2xi32> } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/composite_op_round_trip.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/composite_op_round_trip.mlir new file mode 100644 index 00000000000000..2084e8b7fe004d --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/composite_op_round_trip.mlir @@ -0,0 +1,27 @@ +// RUN: tf_tfl_translate --enable-hlo-to-tf-conversion --input-mlir %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s --check-prefix=CHECK-ROUNDTRIP + + +module { + func.func public @main( %arg0: tensor) -> tensor { + %0 = func.call @test_add_roundtrip(%arg0) : (tensor) -> tensor + + return %0 : tensor + } + + + // CHECK-LABEL: func.func private @test_add_roundtrip + func.func private @test_add_roundtrip(%arg0: tensor) -> tensor { + // CHECK-ROUNDTRIP: %0 = stablehlo.composite "stablehlo.add_n" %arg0 {composite_attributes = {test_bool = false, test_int = 2 : i64, test_string = "test"}, decomposition = @add_n.impl} : (tensor) -> tensor + %0 = stablehlo.composite "stablehlo.add_n" %arg0 { composite_attributes = { test_int = 2 : i64, test_bool = 0 : i1, test_string = "test"}, decomposition = @add_n.impl } : (tensor) -> tensor + return %0 : tensor + } + func.func private @add_n.impl(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<2> : tensor + %1 = stablehlo.add %arg0, %0 : tensor + return %1 : tensor + } + + + + +} \ No 
newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir index 6977b0e84c0d85..b5cc23bf58a739 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir @@ -102,42 +102,42 @@ func.func @int4() -> tensor<5xi4> { func.func @qi32_per_axis() -> tensor<3x3x!quant.uniform> { // CHECK-LABEL: @qi32_per_axis - // CHECK: {qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> + // CHECK: <{qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>}> : () -> tensor<3x3x!quant.uniform> %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> func.return %0 : tensor<3x3x!quant.uniform> } func.func @qi32_per_axis_zero() -> tensor<3x3x!quant.uniform> { // CHECK-LABEL: @qi32_per_axis_zero - // CHECK: {qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> + // CHECK: <{qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>}> : () -> tensor<3x3x!quant.uniform> %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> func.return %0 : tensor<3x3x!quant.uniform> } func.func @qu8() -> tensor<3x!quant.uniform:f32, 1.0>> { // CHECK-LABEL: @qu8 - // CHECK: {qtype = tensor<3x!quant.uniform:f32, 1.000000e+00>>, value = dense<1> : tensor<3xi8>} : () -> tensor<3x!quant.uniform:f32, 1.000000e+00>> + // CHECK: <{qtype = tensor<3x!quant.uniform:f32, 1.000000e+00>>, value = dense<1> : tensor<3xi8>}> : () -> tensor<3x!quant.uniform:f32, 1.000000e+00>> %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x!quant.uniform:f32, 1.0>>, value = dense<1> : tensor<3xi8>} : () -> tensor<3x!quant.uniform:f32, 1.0>> func.return %0 : tensor<3x!quant.uniform:f32, 1.0>> } func.func @sparse_f32() -> tensor<3x2xf32> { // CHECK-LABEL: @sparse_f32 - // CHECK: {compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, -5.000000e-01, -2.500000e-01]> : tensor<8xf32>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0.000000e+00> : tensor<3x2xf32>} + // CHECK: <{compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, -5.000000e-01, -2.500000e-01]> : tensor<8xf32>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0.000000e+00> : tensor<3x2xf32>}> %0 = "tfl.pseudo_sparse_const"() {compressed_data = dense<[1.0, 2.0, 0.5, 0.25, -1.0, -2.0, -0.5, -0.25]> : tensor<8xf32>, s_param = #tfl.sparsity_parameter, #tfl.dimension_metadata, #tfl.dimension_metadata, #tfl.dimension_metadata>, value = dense<0.000000e+00> : tensor<3x2xf32>} : () -> tensor<3x2xf32> func.return %0: tensor<3x2xf32> } func.func @sparse_f16() -> tensor<3x2xf16> { // CHECK-LABEL: @sparse_f16 - // CHECK: {compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, -5.000000e-01, -2.500000e-01]> : tensor<8xf16>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0.000000e+00> : tensor<3x2xf16>} + // CHECK: <{compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, -5.000000e-01, -2.500000e-01]> : tensor<8xf16>, s_param = #tfl.sparsity_parameter, , , >, value = 
dense<0.000000e+00> : tensor<3x2xf16>}> %0 = "tfl.pseudo_sparse_const"() {compressed_data = dense<[1.0, 2.0, 0.5, 0.25, -1.0, -2.0, -0.5, -0.25]> : tensor<8xf16>, s_param = #tfl.sparsity_parameter, #tfl.dimension_metadata, #tfl.dimension_metadata, #tfl.dimension_metadata>, value = dense<0.000000e+00> : tensor<3x2xf16>} : () -> tensor<3x2xf16> func.return %0: tensor<3x2xf16> } func.func @sparse_qu8() -> tensor<3x2x!quant.uniform:f32, 1.0>> { // CHECK-LABEL: @sparse_qu8 - // CHECK: {compressed_data = dense<[1, 2, 3, 4, -1, -2, -3, -4]> : tensor<8xi8>, qtype = tensor<3x2x!quant.uniform:f32, 1.000000e+00>>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0> : tensor<3x2xi8>} + // CHECK: <{compressed_data = dense<[1, 2, 3, 4, -1, -2, -3, -4]> : tensor<8xi8>, qtype = tensor<3x2x!quant.uniform:f32, 1.000000e+00>>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0> : tensor<3x2xi8>}> %0 = "tfl.pseudo_sparse_qconst"() {compressed_data = dense<[1, 2, 3, 4, -1, -2, -3, -4]> : tensor<8xi8>, qtype = tensor<3x2x!quant.uniform:f32, 1.0>>, s_param = #tfl.sparsity_parameter, #tfl.dimension_metadata, #tfl.dimension_metadata, #tfl.dimension_metadata>, value = dense<42> : tensor<3x2xi8>} : () -> tensor<3x2x!quant.uniform:f32, 1.0>> func.return %0: tensor<3x2x!quant.uniform:f32, 1.0>> } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants_offset.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants_offset.mlir index 1bbe28692c60eb..eeca24298432c2 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants_offset.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants_offset.mlir @@ -102,42 +102,42 @@ func.func @int4() -> tensor<5xi4> { func.func @qi32_per_axis() -> tensor<3x3x!quant.uniform> { // CHECK-LABEL: @qi32_per_axis - // CHECK: {qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> + // CHECK: <{qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>}> : () -> tensor<3x3x!quant.uniform> %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> func.return %0 : tensor<3x3x!quant.uniform> } func.func @qi32_per_axis_zero() -> tensor<3x3x!quant.uniform> { // CHECK-LABEL: @qi32_per_axis_zero - // CHECK: {qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> + // CHECK: <{qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>}> : () -> tensor<3x3x!quant.uniform> %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x3x!quant.uniform>, value = dense<1> : tensor<3x3xi32>} : () -> tensor<3x3x!quant.uniform> func.return %0 : tensor<3x3x!quant.uniform> } func.func @qu8() -> tensor<3x!quant.uniform:f32, 1.0>> { // CHECK-LABEL: @qu8 - // CHECK: {qtype = tensor<3x!quant.uniform:f32, 1.000000e+00>>, value = dense<1> : tensor<3xi8>} : () -> tensor<3x!quant.uniform:f32, 1.000000e+00>> + // CHECK: <{qtype = tensor<3x!quant.uniform:f32, 1.000000e+00>>, value = dense<1> : tensor<3xi8>}> : () -> tensor<3x!quant.uniform:f32, 1.000000e+00>> %0 = "tfl.pseudo_qconst"() { qtype = tensor<3x!quant.uniform:f32, 1.0>>, value = dense<1> : tensor<3xi8>} : () -> tensor<3x!quant.uniform:f32, 1.0>> func.return %0 : tensor<3x!quant.uniform:f32, 1.0>> } func.func @sparse_f32() -> tensor<3x2xf32> { // CHECK-LABEL: @sparse_f32 - // CHECK: {compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, 
-5.000000e-01, -2.500000e-01]> : tensor<8xf32>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0.000000e+00> : tensor<3x2xf32>} + // CHECK: <{compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, -5.000000e-01, -2.500000e-01]> : tensor<8xf32>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0.000000e+00> : tensor<3x2xf32>}> %0 = "tfl.pseudo_sparse_const"() {compressed_data = dense<[1.0, 2.0, 0.5, 0.25, -1.0, -2.0, -0.5, -0.25]> : tensor<8xf32>, s_param = #tfl.sparsity_parameter, #tfl.dimension_metadata, #tfl.dimension_metadata, #tfl.dimension_metadata>, value = dense<0.000000e+00> : tensor<3x2xf32>} : () -> tensor<3x2xf32> func.return %0: tensor<3x2xf32> } func.func @sparse_f16() -> tensor<3x2xf16> { // CHECK-LABEL: @sparse_f16 - // CHECK: {compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, -5.000000e-01, -2.500000e-01]> : tensor<8xf16>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0.000000e+00> : tensor<3x2xf16>} + // CHECK: <{compressed_data = dense<[1.000000e+00, 2.000000e+00, 5.000000e-01, 2.500000e-01, -1.000000e+00, -2.000000e+00, -5.000000e-01, -2.500000e-01]> : tensor<8xf16>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0.000000e+00> : tensor<3x2xf16>}> %0 = "tfl.pseudo_sparse_const"() {compressed_data = dense<[1.0, 2.0, 0.5, 0.25, -1.0, -2.0, -0.5, -0.25]> : tensor<8xf16>, s_param = #tfl.sparsity_parameter, #tfl.dimension_metadata, #tfl.dimension_metadata, #tfl.dimension_metadata>, value = dense<0.000000e+00> : tensor<3x2xf16>} : () -> tensor<3x2xf16> func.return %0: tensor<3x2xf16> } func.func @sparse_qu8() -> tensor<3x2x!quant.uniform:f32, 1.0>> { // CHECK-LABEL: @sparse_qu8 - // CHECK: {compressed_data = dense<[1, 2, 3, 4, -1, -2, -3, -4]> : tensor<8xi8>, qtype = tensor<3x2x!quant.uniform:f32, 1.000000e+00>>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0> : tensor<3x2xi8>} + // CHECK: <{compressed_data = dense<[1, 2, 3, 4, -1, -2, -3, -4]> : tensor<8xi8>, qtype = tensor<3x2x!quant.uniform:f32, 1.000000e+00>>, s_param = #tfl.sparsity_parameter, , , >, value = dense<0> : tensor<3x2xi8>}> %0 = "tfl.pseudo_sparse_qconst"() {compressed_data = dense<[1, 2, 3, 4, -1, -2, -3, -4]> : tensor<8xi8>, qtype = tensor<3x2x!quant.uniform:f32, 1.0>>, s_param = #tfl.sparsity_parameter, #tfl.dimension_metadata, #tfl.dimension_metadata, #tfl.dimension_metadata>, value = dense<42> : tensor<3x2xi8>} : () -> tensor<3x2x!quant.uniform:f32, 1.0>> func.return %0: tensor<3x2x!quant.uniform:f32, 1.0>> } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op.mlir index d798655017ee45..d1f7a4bb6423a2 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op.mlir @@ -5,4 +5,4 @@ func.func @main(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>, % func.return %0 : tensor<1x64x84x32xf32> } // CHECK-LABEL: main -// CHECK: "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "Convolution2DTransposeBias", custom_option = #tfl} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> +// CHECK: "tfl.custom"(%arg0, %arg1, %arg2) <{custom_code = "Convolution2DTransposeBias", custom_option = #tfl}> : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> diff --git 
a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op_offset.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op_offset.mlir index e1f3f1ed9e09f8..9f93a628b5d6a4 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op_offset.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/custom_op_offset.mlir @@ -5,4 +5,4 @@ func.func @main(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>, % func.return %0 : tensor<1x64x84x32xf32> } // CHECK-LABEL: main -// CHECK: "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "Convolution2DTransposeBias", custom_option = #tfl} : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> +// CHECK: "tfl.custom"(%arg0, %arg1, %arg2) <{custom_code = "Convolution2DTransposeBias", custom_option = #tfl}> : (tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<4xi32>) -> tensor<1x64x84x32xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_constant.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_constant.mlir index 81259b9a2e288f..377a8d45bb991a 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_constant.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_constant.mlir @@ -8,7 +8,7 @@ func.func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { func.return %0 : tensor<40x40xf32> // CHECK-LABEL: func @main(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> -// CHECK: %[[CONST:[0-9]+]] = "tfl.external_const"() {buffer_index = 3 : i32} : () -> tensor<40xf32> -// CHECK-NEXT: %[[FULL:[0-9]+]]:2 = "tfl.fully_connected"(%arg0, %arg1, %[[CONST]]) {asymmetric_quantize_inputs = false, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: %[[CONST:[0-9]+]] = "tfl.external_const"() <{buffer_index = 3 : i32}> : () -> tensor<40xf32> +// CHECK-NEXT: %[[FULL:[0-9]+]]:2 = "tfl.fully_connected"(%arg0, %arg1, %[[CONST]]) <{asymmetric_quantize_inputs = false, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK-NEXT: return %[[FULL]]#0 } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json index c89592440d89ce..d00f1e6a58adcf 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/import_json.json @@ -1,7 +1,7 @@ // RUN: json_to_flatbuffer %p/test_schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck %s -// CHECK: %[[CST:.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: %[[RES0:.*]] = "tfl.conv_2d"(%arg0, %arg1, %[[CST]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, none) -> tensor<256x32x32x16xf32> +// CHECK: %[[CST:.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: %[[RES0:.*]] = "tfl.conv_2d"(%arg0, %arg1, %[[CST]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, none) -> tensor<256x32x32x16xf32> // CHECK: return %[[RES0]] : tensor<256x32x32x16xf32> { diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc 
b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc index 8fc5a0cb051fdf..f4097c2e5e924b 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc @@ -24,8 +24,8 @@ limitations under the License. #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/lite/model.h" -#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/schema/schema_utils.h" using llvm::cl::opt; @@ -100,14 +100,14 @@ std::optional> InjectStatsToFullyConnected( // CHECK-LABEL: func @main(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) // CHECK-SAME: -> tensor<40x40xf32> - // CHECK: %[[stat:.*]] = "quantfork.stats"(%arg0) {layerStats = dense< - // CHECK-SAME: [-1.000000e+00, 1.000000e+00]> : tensor<2xf32>} + // CHECK: %[[stat:.*]] = "quantfork.stats"(%arg0) <{layerStats = dense + // CHECK-SAME: <[-1.000000e+00, 1.000000e+00]> : tensor<2xf32>}> // CHECK-SAME: : (tensor<40x37xf32>) -> tensor<40x37xf32> - // CHECK-NEXT: %[[cst:.*]] = "tfl.pseudo_const"() {value = dense< - // CHECK-SAME: 1.000000e+00> : tensor<40xf32>} : () -> tensor<40xf32> + // CHECK-NEXT: %[[cst:.*]] = "tfl.pseudo_const"() <{value = dense< + // CHECK-SAME: 1.000000e+00> : tensor<40xf32>}> : () -> tensor<40xf32> // CHECK-NEXT: %[[fc:.*]]:2 = "tfl.fully_connected"(%[[stat]], %arg1, // CHECK-NEXT: %[[stat1:.*]] = "quantfork.stats"(%[[fc]]#0) - // CHECK-SAME: {axis = 1 : i64, + // CHECK-SAME: <{axis = 1 : i64, // CHECK-SAME: axisStats = dense<{{\[}}[-0.000000e+00, 0.000000e+00], // CHECK-SAME: [-1.000000e+00, 1.000000e+00], // CHECK-SAME: [-2.000000e+00, 2.000000e+00] diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc.mlir index 0f351b34f0fe7a..83e48bc1e102fa 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc.mlir @@ -8,7 +8,7 @@ func.func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { func.return %0 : tensor<40x40xf32> // CHECK-LABEL: func @main(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> -// CHECK: %[[CONST:[0-9]+]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<40xf32>} -// CHECK-NEXT: %[[FULL:[0-9]+]]:2 = "tfl.fully_connected"(%arg0, %arg1, %[[CONST]]) {asymmetric_quantize_inputs = false, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: %[[CONST:[0-9]+]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<40xf32>}> +// CHECK-NEXT: %[[FULL:[0-9]+]]:2 = "tfl.fully_connected"(%arg0, %arg1, %[[CONST]]) <{asymmetric_quantize_inputs = false, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK-NEXT: return %[[FULL]]#0 } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/legacy_reshape.json b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/legacy_reshape.json index d698473713a325..c0d80c5a95247c 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/legacy_reshape.json +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/legacy_reshape.json @@ -1,6 +1,6 @@ // RUN: json_to_flatbuffer %p/test_schema.fbs %s | 
flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck %s -// CHECK: %0 = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %0 = "tfl.pseudo_const"() <{value = dense<2> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %1 = "tfl.reshape"(%arg0, %0) : (tensor<1x4xf32>, tensor<2xi32>) -> tensor<2x2xf32> { diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.json b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.json index b6704814326ae2..9403af28efec36 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.json +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.json @@ -1,8 +1,8 @@ // RUN: json_to_flatbuffer %p/test_schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck %s // RUN: json_to_flatbuffer %p/test_schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | flatbuffer_translate -mlir-to-tflite-flatbuffer - -o - | flatbuffer_to_string - | FileCheck --check-prefix=RoundTrip %s -// CHECK-DAG: %[[input_18:.*]] = "quantfork.stats"({{.*}}) {layerStats = dense<[-8.000000e-01, 1.600000e+00]> : tensor<2xf32>} : (tensor<1x4xf32>) -> tensor<1x4xf32> -// CHECK-DAG: %[[input_19:.*]] = "quantfork.stats"({{.*}}) {layerStats = dense<[-2.000000e+00, 4.000000e+00]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> +// CHECK-DAG: %[[input_18:.*]] = "quantfork.stats"({{.*}}) <{layerStats = dense<[-8.000000e-01, 1.600000e+00]> : tensor<2xf32>}> : (tensor<1x4xf32>) -> tensor<1x4xf32> +// CHECK-DAG: %[[input_19:.*]] = "quantfork.stats"({{.*}}) <{layerStats = dense<[-2.000000e+00, 4.000000e+00]> : tensor<2xf32>}> : (tensor<1x2xf32>) -> tensor<1x2xf32> // CHECK: "tfl.unidirectional_sequence_lstm"({{.*}}, %[[input_18]], %[[input_19]], %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}) // CHECK-SAME: effective_hidden_scale_intermediate = tensor<*x!quant.calibrated>> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir index 33e5cca6e5de17..7b7b2d1273666b 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir @@ -8,10 +8,11 @@ func.func @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x func.return %24 : tensor<1x4xf32> // CHECK-LABEL: main // separate lines since there is no region for this op. 
third_party/tensorflow/compiler/mlir/lite/ir/tfl_ops.td: 3252 -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ -// CHECK: }) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) +// CHECK-SAME: <{asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32}> ({ +// CHECK: }) : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] } @@ -23,16 +24,17 @@ func.func @testFullyQuantizedLSTM(%arg0: tensor<1x528x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, effective_hidden_scale_intermediate = tensor<0x!quant.uniform>, kernel_type = #tfl, proj_clip = 0.01 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> 
tensor<1x640x!quant.uniform> func.return %0 : tensor<1x640x!quant.uniform> // CHECK-LABEL: testFullyQuantizedLSTM -// CHECK: %[[CST:.*]] = "tfl.no_value"() {value} : () -> none +// CHECK: %[[CST:.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK: %[[RES0:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %[[CST]], %[[CST]], %[[CST]], %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) -// CHECK: }) {asymmetric_quantize_inputs = false, cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, kernel_type = #tfl, proj_clip = 0.00999999977 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> +// CHECK-SAME: <{asymmetric_quantize_inputs = false, cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, kernel_type = #tfl, proj_clip = 0.00999999977 : f32}> ({ +// CHECK: }) : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> } // ----- // CHECK-LABEL: testUnidirectionalSequenceLstmWithIntermediates func.func @testUnidirectionalSequenceLstmWithIntermediates(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, 
%arg23) {asymmetric_quantize_inputs = false, cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, effective_hidden_scale_intermediate = tensor<0x!quant.uniform>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0xf32>, input_to_forget_intermediate = tensor<0xf32>, input_to_input_intermediate = tensor<0xf32>, input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) <{asymmetric_quantize_inputs = false, cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, effective_hidden_scale_intermediate = tensor<0x!quant.uniform>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0xf32>, input_to_forget_intermediate = tensor<0xf32>, input_to_input_intermediate = tensor<0xf32>, input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false}> : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0xf32>, input_to_forget_intermediate = tensor<0xf32>, input_to_input_intermediate = tensor<0xf32>, input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -46,10 +48,11 @@ func.func @testLSTMAsymAttributeTrue(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {asymmetric_quantize_inputs = true, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> func.return %24 : tensor<1x4xf32> -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK: %[[RES2:.*]] = 
"tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ -// CHECK: }) {asymmetric_quantize_inputs = true, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) +// CHECK-SAME: <{asymmetric_quantize_inputs = true, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32}> ({ +// CHECK: }) : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] } @@ -63,10 +66,11 @@ func.func @testLSTMAsymAttributeFalse(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4x %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> func.return %24 : tensor<1x4xf32> -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ -// CHECK: }) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, 
tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) +// CHECK-SAME: <{asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32}> ({ +// CHECK: }) : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] } @@ -80,10 +84,11 @@ func.func @testLSTMAsymAttributeDefault(%arg0: tensor<1x4xf32>, %arg1: tensor<4x %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> func.return %24 : tensor<1x4xf32> -// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() {tfl.is_variable, value = dense<{{.*}}> : tensor<1x4xf32>} : () -> tensor<1x4xf32> -// CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) ({ -// CHECK: }) {asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> +// CHECK-DAG: %[[RES0:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// 
CHECK-DAG: %[[RES1:.*]] = "tfl.pseudo_const"() <{value = dense<{{.*}}> : tensor<1x4xf32>}> {tfl.is_variable} : () -> tensor<1x4xf32> +// CHECK: %[[RES2:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %[[RES0]], %[[RES1]], %arg18, %arg19, %arg20, %arg21) +// CHECK-SAME: <{asymmetric_quantize_inputs = false, cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32}> ({ +// CHECK: }) : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES2]] } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/many_attribute_op.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/many_attribute_op.mlir index fb88f37615d731..faac23eedcff54 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/many_attribute_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/many_attribute_op.mlir @@ -3,7 +3,7 @@ // Confirm a wide array of attributes survives the round-trip func.func @main(tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> { ^bb0(%arg0: tensor<1x6x6x16xf32>): - // CHECK: "tfl.average_pool_2d"(%{{.*}}) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> + // CHECK: "tfl.average_pool_2d"(%{{.*}}) <{filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32}> : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> loc("avgpool") func.return %0 : tensor<1x1x1x16xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir index 820bf2ca33decc..ac5c8a89725e29 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/math.mlir @@ -3,7 +3,7 @@ func.func @main(tensor<4xf32>) -> tensor<4xf32> { ^bb0(%arg0: tensor<4xf32>): - // CHECK: [[CONST:%.*]] = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> + // CHECK: [[CONST:%.*]] = "tfl.pseudo_const"() <{value = dense<1.000000e+00> : tensor<4xf32>}> : () -> tensor<4xf32> // CHECK-NEXT: [[SQDIFF:%.*]] = tfl.squared_difference %arg0, [[CONST]] : tensor<4xf32> // CHECK-NEXT: %{{.*}} = tfl.mul %arg0, [[SQDIFF]] {fused_activation_function = "NONE"} : tensor<4xf32> %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/matmul.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/matmul.mlir index c786f08ef30209..0b817991f659b7 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/matmul.mlir +++
b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/matmul.mlir @@ -5,7 +5,7 @@ func.func @main(%arg0: tensor<4x10x15xf32>, %arg1: tensor<4x15x17xf32>) -> tenso func.return %0: tensor<4x10x17xf32> // CHECK-LABEL: main -// CHECK: %[[RESULT0:.*]] = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> +// CHECK: %[[RESULT0:.*]] = "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> // CHECK: return %[[RESULT0]] } @@ -14,7 +14,7 @@ func.func @testMatmulAsymAttributeTrue(%arg0: tensor<4x10x15xf32>, %arg1: tensor %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = true} : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> func.return %0: tensor<4x10x17xf32> -// CHECK: %[[RESULT0:.*]] = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = true} : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> +// CHECK: %[[RESULT0:.*]] = "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = true}> : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> // CHECK: return %[[RESULT0]] } @@ -23,6 +23,6 @@ func.func @testMatmulAsymAttributeFalse(%arg0: tensor<4x10x15xf32>, %arg1: tenso %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> func.return %0: tensor<4x10x17xf32> -// CHECK: %[[RESULT0:.*]] = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> +// CHECK: %[[RESULT0:.*]] = "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<4x10x15xf32>, tensor<4x15x17xf32>) -> tensor<4x10x17xf32> // CHECK: return %[[RESULT0]] } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional.mlir index 2779c8ed684575..ada417fe9d4085 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional.mlir @@ -2,7 +2,7 @@ // Test to make sure optional parameters survive a roundtrip func.func @main(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { -// CHECK: [[NONE:%.*]] = "tfl.no_value"() {value} : () -> none +// CHECK: [[NONE:%.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK: "tfl.fully_connected"(%arg0, %arg1, [[NONE]]) // CHECK-SAME: (tensor<40x37xf32>, tensor<40x37xf32>, none) -> (tensor<40x40xf32>, tensor<40x40xf32>) %cst = "tfl.no_value"() {value = unit} : () -> none diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json index 3107e7ea2695c4..0d04ef052437e7 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/optional_input.json @@ -2,8 +2,8 @@ // This test verifies that if the flatbuffer omits the last optional input `bias` of the tfl.conv_2d op, the flatbuffer_importer will automatically add a `none` value to tfl.conv_2d.
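For orientation, the hunks around here all follow the same pattern as this change: inherent op properties move from the `{...}` attribute dictionary into the `<{...}>` properties segment, while discardable attributes (for example `tfl.is_variable` in the lstm.mlir and variable.mlir hunks of this diff) stay in a trailing `{...}`. The following is a minimal, standalone sketch of the IR the importer produces for the omitted-`bias` case checked below; the shapes are copied from the CHECK lines of this test, and the function name @conv_no_bias is purely illustrative:

func.func @conv_no_bias(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>) -> tensor<256x32x32x16xf32> {
  // The omitted optional `bias` operand is materialized by the importer as a
  // unit-valued tfl.no_value whose result type is `none`.
  %bias = "tfl.no_value"() <{value}> : () -> none
  // Inherent properties now print inside <{...}> rather than the old {...}.
  %0 = "tfl.conv_2d"(%arg0, %arg1, %bias) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, none) -> tensor<256x32x32x16xf32>
  func.return %0 : tensor<256x32x32x16xf32>
}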
-// CHECK: %[[CST:.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: %[[RES0:.*]] = "tfl.conv_2d"(%arg0, %arg1, %[[CST]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, none) -> tensor<256x32x32x16xf32> +// CHECK: %[[CST:.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: %[[RES0:.*]] = "tfl.conv_2d"(%arg0, %arg1, %[[CST]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, none) -> tensor<256x32x32x16xf32> // CHECK: return %[[RES0]] : tensor<256x32x32x16xf32> { diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quant_stats.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quant_stats.mlir index e0d581e4e799a9..8c7c1201841a70 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quant_stats.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quant_stats.mlir @@ -3,7 +3,7 @@ func.func @main(%arg0: tensor<1x512x672x8xf32>) -> tensor<1x512x672x8xf32> { // CHECK-LABEL: @main -// CHECK: %[[RES0:.*]] = "quantfork.stats"(%arg0) {layerStats = dense<[0.000000e+00, 2.550000e+02]> : tensor<2xf32>} : (tensor<1x512x672x8xf32>) -> tensor<1x512x672x8xf32> +// CHECK: %[[RES0:.*]] = "quantfork.stats"(%arg0) <{layerStats = dense<[0.000000e+00, 2.550000e+02]> : tensor<2xf32>}> : (tensor<1x512x672x8xf32>) -> tensor<1x512x672x8xf32> %0 = "quantfork.stats"(%arg0) {layerStats = dense<[0.000000e+00, 2.550000e+02]> : tensor<2xf32>} : (tensor<1x512x672x8xf32>) -> tensor<1x512x672x8xf32> func.return %0 : tensor<1x512x672x8xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir index fdba4fd64b9697..6a6c3378155254 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/quantization.mlir @@ -2,10 +2,10 @@ // CHECK-LABEL: main func.func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x401408xf32> { -// CHECK: %{{.*}} = "tfl.quantize"(%{{.*}}) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> +// CHECK: %{{.*}} = "tfl.quantize"(%{{.*}}) <{qtype = tensor<1x224x224x3x!quant.uniform>}> : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> // The float values here don't match exactly because double -> float -> double is lossy -// CHECK-NEXT: %{{.*}} = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678{{[0-9]*}}:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678{{[0-9]*}}:151>> -// CHECK-NEXT: %{{.*}} = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> +// CHECK-NEXT: %{{.*}} = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678{{[0-9]*}}:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678{{[0-9]*}}:151>> +// CHECK-NEXT: %{{.*}} = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> : () -> tensor<32x!quant.uniform> // CHECK: %{{.*}} = "tfl.dequantize"(%{{.*}}) : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408xf32> %cst =
arith.constant dense<[1, 401408]> : tensor<2xi32> @@ -27,9 +27,9 @@ func.func @quantized_constant(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { %3 = "tfl.dequantize"(%2) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> func.return %3 : tensor<2x2xf32> -// CHECK-NEXT: %[[Q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK-NEXT: %[[CST:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x!quant.uniform>, value = dense<-76> : tensor<1x2xi8>} : () -> tensor<1x2x!quant.uniform> -// CHECK-NEXT: %[[CONCAT:.*]] = "tfl.concatenation"(%[[Q]], %[[CST]]) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> +// CHECK-NEXT: %[[Q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK-NEXT: %[[CST:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x2x!quant.uniform>, value = dense<-76> : tensor<1x2xi8>}> : () -> tensor<1x2x!quant.uniform> +// CHECK-NEXT: %[[CONCAT:.*]] = "tfl.concatenation"(%[[Q]], %[[CST]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> // CHECK-NEXT: %[[DQ:.*]] = "tfl.dequantize"(%[[CONCAT]]) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> // CHECK-NEXT: return %[[DQ]] : tensor<2x2xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/reshape.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/reshape.mlir index 73344ddc4f535f..ae5f81c80b35f4 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/reshape.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/reshape.mlir @@ -2,7 +2,7 @@ // Confirm we can extract type info from reshape func.func @main() -> tensor<2x2xf32> { - // CHECK: %[[cst:.*]] = "tfl.pseudo_const"() {value = dense<2> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[cst:.*]] = "tfl.pseudo_const"() <{value = dense<2> : tensor<2xi32>}> : () -> tensor<2xi32> // CHECK: %{{.*}} = "tfl.reshape"(%{{.*}}, %[[cst]]) : (tensor<4xf32>, tensor<2xi32>) -> tensor<2x2xf32> %cst = arith.constant dense<[2, 2]> : tensor<2xi32> %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir index 906e6efff29305..0d642ce2c1a6e8 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir @@ -10,9 +10,9 @@ func.func @main(tensor<3x2xi32>) -> tensor<3x2xi32> { // CHECK-SAME: tfl.description = "MLIR Converted." 
// CHECK-SAME: tfl.schema_version = 3 : i32 - // CHECK: %{{.*}} = "tfl.pseudo_const"() {value = dense<{{\[\[1, 2\], \[3, 4\], \[5, 6\]\]}}> : tensor<3x2xi32>} + // CHECK: %{{.*}} = "tfl.pseudo_const"() <{value = dense<{{\[\[1, 2\], \[3, 4\], \[5, 6\]\]}}> : tensor<3x2xi32>}> // CHECK-NEXT: [[SUB:%.*]] = tfl.sub %{{.*}}, %{{.*}} {fused_activation_function = "RELU6"} : tensor<3x2xi32> - // CHECK-NEXT: [[SCALAR:%.*]] = "tfl.pseudo_const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NEXT: [[SCALAR:%.*]] = "tfl.pseudo_const"() <{value = dense<10> : tensor}> : () -> tensor // CHECK-NEXT: [[ADD:%.*]] = tfl.add([[SCALAR]], [[SUB]]) {fused_activation_function = "NONE"} : (tensor, tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: return [[ADD]] : tensor<3x2xi32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/variable.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/variable.mlir index 0914fc37016771..40c8aa3ce64ddf 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/variable.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/variable.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: main func.func @main() -> tensor<3x2xi32> { - // CHECK: "tfl.pseudo_const"() {tfl.is_variable, value = dense<0> : tensor<3x2xi32>} : () -> tensor<3x2xi32> + // CHECK: "tfl.pseudo_const"() <{value = dense<0> : tensor<3x2xi32>}> {tfl.is_variable} : () -> tensor<3x2xi32> %0 = "tfl.pseudo_const"() {value = dense<0> : tensor<3x2xi32>, tfl.is_variable} : () -> tensor<3x2xi32> loc("variable") func.return %0 : tensor<3x2xi32> } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir index 9b0e8c4863fc4b..51d685bd910561 100644 --- a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir +++ b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir @@ -1027,7 +1027,7 @@ func.func @WhitespaceTokenize_RaggedGather_1_Assert_3_AssertGuard_true_23810(%ar } // CHECK: func private @whitespace_tokenizer_rank1(%arg0: tensor<1x!tf_type.string> {tf._user_specified_name = "input"}) -> (tensor, tensor) attributes {tf._implements = #tf_type.func<@"tftext:WhitespaceTokenizer", {}>, tf._input_shapes = [#tf_type.shape<1>], tf.signature.is_stateful} { -// CHECK: %0:2 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = #tfl} : (tensor<1x!tf_type.string>) -> (tensor, tensor) +// CHECK: %0:2 = "tfl.custom"(%arg0) <{custom_code = "tftext:WhitespaceTokenizer", custom_option = #tfl}> : (tensor<1x!tf_type.string>) -> (tensor, tensor) // CHECK: return %0#0, %0#1 : tensor, tensor func.func private @whitespace_tokenizer_rank2(%arg0: tensor {tf._user_specified_name = "input"}) -> (tensor, tensor, tensor) attributes {tf._input_shapes = [#tf_type.shape], tf._implements = #tf_type.func<@"tftext:WhitespaceTokenizer", {}>, tf.signature.is_stateful} { @@ -2161,7 +2161,7 @@ func.func @WhitespaceTokenize_WhitespaceTokenize_WhitespaceTokenize_RaggedGather // CHECK: func private @whitespace_tokenizer_rank2(%arg0: tensor {tf._user_specified_name = "input"}) -> (tensor, tensor, tensor) attributes {tf._implements = #tf_type.func<@"tftext:WhitespaceTokenizer", {}>, tf._input_shapes = [#tf_type.shape], tf.signature.is_stateful} { -// CHECK: %0:3 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = #tfl} : (tensor) -> (tensor, tensor, tensor) +// CHECK: %0:3 = "tfl.custom"(%arg0) <{custom_code = "tftext:WhitespaceTokenizer", custom_option = #tfl}> : (tensor) -> (tensor, 
tensor, tensor) // CHECK: return %0#0, %0#1, %0#2 : tensor, tensor, tensor func.func private @whitespace_tokenizer_rank0(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._input_shapes = [#tf_type.shape<>], tf._implements = #tf_type.func<@"tftext:WhitespaceTokenizer", {}>, tf.signature.is_stateful} { @@ -3191,7 +3191,7 @@ func.func @WhitespaceTokenize_WhitespaceTokenize_RaggedGather_1_Assert_3_AssertG } // CHECK: func private @whitespace_tokenizer_rank0(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._implements = #tf_type.func<@"tftext:WhitespaceTokenizer", {}>, tf._input_shapes = [#tf_type.shape<>], tf.signature.is_stateful} { -// CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = #tfl} : (tensor) -> tensor +// CHECK: %0 = "tfl.custom"(%arg0) <{custom_code = "tftext:WhitespaceTokenizer", custom_option = #tfl}> : (tensor) -> tensor // CHECK: return %0 : tensor func.func @ngrams(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._input_shapes = [#tf_type.shape], tf._implements = #tf_type.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = " ", width = 2 : i64}>} { @@ -3209,7 +3209,7 @@ func.func @ngrams(%arg0: tensor {tf._user_specified_name = "i } // CHECK: func @ngrams(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor attributes {tf._implements = #tf_type.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = " ", width = 2 : i64}>, tf._input_shapes = [#tf_type.shape]} { -// CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "tftext:Ngrams", custom_option = #tfl} : (tensor) -> tensor +// CHECK: %0 = "tfl.custom"(%arg0) <{custom_code = "tftext:Ngrams", custom_option = #tfl}> : (tensor) -> tensor // CHECK: return %0 : tensor // CHECK: } @@ -3434,7 +3434,7 @@ func.func private @NGrams_SlidingWindow_RaggedConcat_assert_equal_2_Assert_Asser func.return %5 : tensor } // CHECK: func private @ngrams_ragged_rank_2(%arg0: tensor {tf._user_specified_name = "values"}, %arg1: tensor<3xi64> {tf._user_specified_name = "args_0"}, %arg2: tensor {tf._user_specified_name = "args_1"}) -> (tensor, tensor<3xi64>, tensor) attributes {tf._implements = #tf_type.func<@"tftext:Ngrams", {axis = -1 : i64, reduction_type = "STRING_JOIN", string_separator = "", width = 2 : i64}>, tf._input_shapes = [#tf_type.shape, #tf_type.shape<3>, #tf_type.shape], tf.signature.is_stateful} { -// CHECK: %0:3 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "tftext:Ngrams", custom_option = #tfl} : (tensor, tensor<3xi64>, tensor) -> (tensor, tensor<3xi64>, tensor) +// CHECK: %0:3 = "tfl.custom"(%arg0, %arg1, %arg2) <{custom_code = "tftext:Ngrams", custom_option = #tfl}> : (tensor, tensor<3xi64>, tensor) -> (tensor, tensor<3xi64>, tensor) // CHECK: return %0#0, %0#1, %0#2 : tensor, tensor<3xi64>, tensor @@ -3449,5 +3449,5 @@ func.func private @sgnn_projection(%arg0: tensor {tf._user_sp // CHECK: func private @sgnn_projection(%arg0: tensor {tf._user_specified_name = "values"}, %arg1: tensor {tf._user_specified_name = "row_splits"}) -> tensor attributes {tf._implements = #tf_type.func<@"tftext:custom:SgnnProjection", {buckets = 2147483647 : i64, hash_seed = [1902835825, -1475704015, 473120514, 1254202069, 1558833093, 1756181982, 1906603252, -1034142694, 542842690, 535515822]}>, tf._input_shapes = [#tf_type.shape, #tf_type.shape], tf.signature.is_stateful} { -// CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = 
"tftext:custom:SgnnProjection", custom_option = #tfl} : (tensor, tensor) -> tensor +// CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "tftext:custom:SgnnProjection", custom_option = #tfl}> : (tensor, tensor) -> tensor // CHECK: return %0 : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/insert_call_once_op.mlir b/tensorflow/compiler/mlir/lite/tests/insert_call_once_op.mlir index db3d75e93a805a..44ce1b5c96e90d 100644 --- a/tensorflow/compiler/mlir/lite/tests/insert_call_once_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/insert_call_once_op.mlir @@ -22,7 +22,7 @@ module attributes {tf_saved_model.semantics} { %1 = "tf.LookupTableFindV2"(%0, %arg0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor<*x!tf_type.string> func.return %1 : tensor<*x!tf_type.string> // CHECK-LABEL: @serving_default - // CHECK: "tfl.call_once"() {session_init_function = "init_all_tables"} : () -> () + // CHECK: "tfl.call_once"() <{session_init_function = "init_all_tables"}> : () -> () } } diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir index 74b10665528406..2cf209c8e0444f 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tensorlist.mlir @@ -4,7 +4,7 @@ // CHECK-LABEL: listReserveScalarShapeI32 func.func @listReserveScalarShapeI32(%arg0: tensor, %arg1: tensor) -> tensor>> { %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor, tensor) -> tensor>> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListReserve", custom_option = #tfl} : (tensor, tensor) -> tensor>> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListReserve", custom_option = #tfl}> : (tensor, tensor) -> tensor>> func.return %0 : tensor>> } @@ -13,7 +13,7 @@ func.func @listReserveScalarShapeI32(%arg0: tensor, %arg1: tensor) -> // CHECK-LABEL: listReserve1DShapeI32 func.func @listReserve1DShapeI32(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor>> { %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor<2xi32>, tensor) -> tensor>> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListReserve", custom_option = #tfl} : (tensor<2xi32>, tensor) -> tensor>> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListReserve", custom_option = #tfl}> : (tensor<2xi32>, tensor) -> tensor>> func.return %0 : tensor>> } @@ -22,7 +22,7 @@ func.func @listReserve1DShapeI32(%arg0: tensor<2xi32>, %arg1: tensor) -> te // CHECK-LABEL: listReserveScalarShapeFloat func.func @listReserveScalarShapeFloat(%arg0: tensor, %arg1: tensor) -> tensor>> { %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor, tensor) -> tensor>> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListReserve", custom_option = #tfl} : (tensor, tensor) -> tensor>> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListReserve", custom_option = #tfl}> : (tensor, tensor) -> tensor>> func.return %0 : tensor>> } @@ -31,7 +31,7 @@ func.func @listReserveScalarShapeFloat(%arg0: tensor, %arg1: tensor) - // CHECK-LABEL: listReserveScalarShapeLong func.func @listReserveScalarShapeLong(%arg0: tensor, %arg1: tensor) -> tensor>> { %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor, tensor) -> tensor>> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListReserve", custom_option = #tfl} : (tensor, tensor) -> tensor>> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListReserve", custom_option = #tfl}> : (tensor, 
tensor) -> tensor>> func.return %0 : tensor>> } @@ -40,7 +40,7 @@ func.func @listReserveScalarShapeLong(%arg0: tensor, %arg1: tensor) -> // CHECK-LABEL: listReserveScalarShapeBool func.func @listReserveScalarShapeBool(%arg0: tensor, %arg1: tensor) -> tensor>> { %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor, tensor) -> tensor>> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListReserve", custom_option = #tfl} : (tensor, tensor) -> tensor>> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListReserve", custom_option = #tfl}> : (tensor, tensor) -> tensor>> func.return %0 : tensor>> } @@ -49,7 +49,7 @@ func.func @listReserveScalarShapeBool(%arg0: tensor, %arg1: tensor) -> // CHECK-LABEL: listStack func.func @listStack(%arg0: tensor>>, %arg1: tensor) -> tensor<*xi32> { %0 = "tf.TensorListStack"(%arg0, %arg1) : (tensor>>, tensor) -> tensor<*xi32> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListStack", custom_option = #tfl} : (tensor>>, tensor) -> tensor<*xi32> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListStack", custom_option = #tfl}> : (tensor>>, tensor) -> tensor<*xi32> func.return %0 : tensor<*xi32> } @@ -58,7 +58,7 @@ func.func @listStack(%arg0: tensor>>, %arg1: tens // CHECK-LABEL: listSetItem func.func @listSetItem(%arg0: tensor>>, %arg1: tensor, %arg2: tensor<*xi32>) -> tensor>> { %0 = "tf.TensorListSetItem"(%arg0, %arg1, %arg2) : (tensor>>, tensor, tensor<*xi32>) -> tensor>> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "TensorListSetItem", custom_option = #tfl} : (tensor>>, tensor, tensor<*xi32>) -> tensor>> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2) <{custom_code = "TensorListSetItem", custom_option = #tfl}> : (tensor>>, tensor, tensor<*xi32>) -> tensor>> func.return %0 : tensor>> } @@ -67,7 +67,7 @@ func.func @listSetItem(%arg0: tensor>>, %arg1: te // CHECK-LABEL: listGetItem func.func @listGetItem(%arg0: tensor>>, %arg1: tensor, %arg2: tensor<2xi32>) -> tensor<2xi32> { %0 = "tf.TensorListGetItem"(%arg0, %arg1, %arg2) : (tensor>>, tensor, tensor<2xi32>) -> tensor<2xi32> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "TensorListGetItem", custom_option = #tfl} : (tensor>>, tensor, tensor<2xi32>) -> tensor<2xi32> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2) <{custom_code = "TensorListGetItem", custom_option = #tfl}> : (tensor>>, tensor, tensor<2xi32>) -> tensor<2xi32> func.return %0 : tensor<2xi32> } @@ -77,7 +77,7 @@ func.func @listGetItem(%arg0: tensor>>, %arg1: te func.func @listFromTensor(%tensor: tensor<3xi32>, %shape : tensor) -> tensor>> { %0 = "tf.TensorListFromTensor"(%tensor, %shape) : (tensor<3xi32>, tensor) -> tensor>> func.return %0 : tensor>> - // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListFromTensor", custom_option = #tfl} : (tensor<3xi32>, tensor) -> tensor>> + // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListFromTensor", custom_option = #tfl}> : (tensor<3xi32>, tensor) -> tensor>> } // ----- @@ -95,7 +95,7 @@ func.func @typeNotSupportedNotLegalized(%arg0: tensor>>) -> tensor { %0 = "tf.TensorListLength"(%arg0) : (tensor>>) -> tensor - // CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "TensorListLength", custom_option = #tfl} : (tensor>>) -> tensor + // CHECK: %0 = "tfl.custom"(%arg0) <{custom_code = "TensorListLength", custom_option = #tfl}> : (tensor>>) -> tensor func.return %0 : tensor } @@ -105,7 +105,7 @@ func.func @listLength(%arg0: tensor>>) -> tensor< func.func 
@listEmptyToListReserve(%arg0: tensor, %arg1: tensor) -> tensor>> {
%0 = "tf.EmptyTensorList"(%arg0, %arg1) : (tensor, tensor) -> tensor>>
// CHECK: %cst = arith.constant dense<0> : tensor
- // CHECK: %0 = "tfl.custom"(%arg0, %cst) {custom_code = "TensorListReserve", custom_option = #tfl} : (tensor, tensor) -> tensor>>
+ // CHECK: %0 = "tfl.custom"(%arg0, %cst) <{custom_code = "TensorListReserve", custom_option = #tfl}> : (tensor, tensor) -> tensor>>
func.return %0 : tensor>>
}
@@ -114,7 +114,7 @@ func.func @listEmptyToListReserve(%arg0: tensor, %arg1: tensor) -> t
// CHECK-LABEL: listElementShape
func.func @listElementShape(%arg0: tensor>>) -> tensor<*xi32> {
%0 = "tf.TensorListElementShape"(%arg0) : (tensor>>) -> tensor<*xi32>
- // CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "TensorListElementShape", custom_option = #tfl} : (tensor>>) -> tensor<*xi32>
+ // CHECK: %0 = "tfl.custom"(%arg0) <{custom_code = "TensorListElementShape", custom_option = #tfl}> : (tensor>>) -> tensor<*xi32>
func.return %0 : tensor<*xi32>
}
@@ -123,7 +123,7 @@ func.func @listElementShape(%arg0: tensor>>) -> t
// CHECK-LABEL: listPopBack
func.func @listPopBack(%arg0: tensor>>, %arg1: tensor<1xi32>) -> (tensor>>, tensor<2xi32>) {
%0, %1 = "tf.TensorListPopBack"(%arg0, %arg1) : (tensor>>, tensor<1xi32>) -> (tensor>>, tensor<2xi32>)
- // CHECK: %0:2 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListPopBack", custom_option = #tfl} : (tensor>>, tensor<1xi32>) -> (tensor>>, tensor<2xi32>)
+ // CHECK: %0:2 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListPopBack", custom_option = #tfl}> : (tensor>>, tensor<1xi32>) -> (tensor>>, tensor<2xi32>)
func.return %0, %1 : tensor>>, tensor<2xi32>
}
@@ -132,7 +132,7 @@ func.func @listPopBack(%arg0: tensor>>, %arg1: te
// CHECK-LABEL: listPushBack
func.func @listPushBack(%arg0: tensor>>, %arg1: tensor<16x1xf32>) -> tensor>> {
%0 = "tf.TensorListPushBack"(%arg0, %arg1) : (tensor>>, tensor<16x1xf32>) -> tensor>>
- // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "TensorListPushBack", custom_option = #tfl} : (tensor>>, tensor<16x1xf32>) -> tensor>>
+ // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "TensorListPushBack", custom_option = #tfl}> : (tensor>>, tensor<16x1xf32>) -> tensor>>
func.return %0: tensor>>
}
@@ -141,7 +141,7 @@ func.func @listPushBack(%arg0: tensor>>, %arg1:
// CHECK-LABEL: variantAddN
func.func @variantAddN(%arg0: tensor>>, %arg1: tensor>>) -> tensor>> {
%1 = "tf.AddN"(%arg0, %arg1) : (tensor>>, tensor>>) -> tensor>>
- // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "VariantAddN", custom_option = #tfl} : (tensor>>, tensor>>) -> tensor>>
+ // CHECK: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "VariantAddN", custom_option = #tfl}> : (tensor>>, tensor>>) -> tensor>>
func.return %1 : tensor>>
}
diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-hashtables.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-hashtables.mlir
index 2e90bdff67ad6e..70d9701ac63b12 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-hashtables.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-hashtables.mlir
@@ -7,7 +7,7 @@ func.func @hashtable_string_to_int64(%arg0: tensor) -> tensor<*xi64> {
%1 = "tf.LookupTableFindV2"(%0, %cst, %arg0) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64>
// CHECK-LABEL: hashtable_string_to_int64
// CHECK: [[CST:%.*]] = arith.constant dense<"f"> : tensor
- // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() {key_dtype = !tf_type.string, table_id = 1530976467 : i32, value_dtype = i64} : () -> tensor<1x!tf_type.resource>
+ // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() <{key_dtype = !tf_type.string, table_id = 1530976467 : i32, value_dtype = i64}> : () -> tensor<1x!tf_type.resource>
// CHECK-NEXT: [[FIND:%.*]] = "tfl.hashtable_find"([[HASH_TABLE]], [[CST]], %arg0) : (tensor<1x!tf_type.resource>, tensor, tensor) -> tensor<*xi64>
// CHECK-NEXT: return [[FIND]] : tensor<*xi64>
func.return %1 : tensor<*xi64>
@@ -22,7 +22,7 @@ func.func @hashtable_int64_to_string(%arg0: tensor) -> tensor<*x!tf_type.st
%1 = "tf.LookupTableFindV2"(%0, %arg0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor<*x!tf_type.string>
// CHECK-LABEL: hashtable_int64_to_string
// CHECK: [[CST:%.*]] = arith.constant dense<"f"> : tensor
- // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() {key_dtype = i64, table_id = 1530976467 : i32, value_dtype = !tf_type.string} : () -> tensor<1x!tf_type.resource>
+ // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() <{key_dtype = i64, table_id = 1530976467 : i32, value_dtype = !tf_type.string}> : () -> tensor<1x!tf_type.resource>
// CHECK-NEXT: [[FIND:%.*]] = "tfl.hashtable_find"([[HASH_TABLE]], %arg0, [[CST]]) : (tensor<1x!tf_type.resource>, tensor, tensor) -> tensor<*x!tf_type.string>
// CHECK-NEXT: return [[FIND]] : tensor<*x!tf_type.string>
func.return %1 : tensor<*x!tf_type.string>
@@ -52,7 +52,7 @@ func.func @hashtable_import(%arg0: tensor<5x!tf_type.string>) {
// CHECK-LABEL: hashtable_import
// CHECK: [[CST:%.*]] = arith.constant dense<["emerson", "lake", "palmer"]> : tensor<3x!tf_type.string>
// CHECK-NEXT: [[CST_0:%.*]] = arith.constant dense<[0, 1, 2]> : tensor<3xi64>
- // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() {key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64} : () -> tensor<1x!tf_type.resource>
+ // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() <{key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64}> : () -> tensor<1x!tf_type.resource>
// CHECK-NEXT: "tfl.hashtable_import"([[HASH_TABLE]], [[CST]], [[CST_0]]) : (tensor<1x!tf_type.resource>, tensor<3x!tf_type.string>, tensor<3xi64>) -> ()
}
@@ -63,7 +63,7 @@ func.func @hashtable_size(%arg0: tensor<5x!tf_type.string>) -> tensor {
%0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_1dd4fef4-646d-491f-a3a8-bf5334f45813", use_node_name_sharing = false, value_dtype = i64} : () -> tensor
%1 = "tf.LookupTableSizeV2"(%0) {device = ""} : (tensor) -> tensor
// CHECK-LABEL: hashtable_size
- // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() {key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64} : () -> tensor<1x!tf_type.resource>
+ // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() <{key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64}> : () -> tensor<1x!tf_type.resource>
// CHECK-NEXT: [[SIZE:%.*]] = "tfl.hashtable_size"([[HASH_TABLE]]) : (tensor<1x!tf_type.resource>) -> tensor
// CHECK-NEXT: return [[SIZE]] : tensor
func.return %1 : tensor
@@ -83,7 +83,7 @@ func.func @hashtable_import_then_find(%arg0: tensor<5x!tf_type.string>) -> tenso
// CHECK: [[CST:%.*]] = arith.constant dense<["emerson", "lake", "palmer"]> : tensor<3x!tf_type.string>
// CHECK-NEXT: [[CST_0:%.*]] = arith.constant dense<-1> : tensor
// CHECK-NEXT: [[CST_1:%.*]] = arith.constant dense<[0, 1, 2]> : tensor<3xi64>
- // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() {key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64} : () -> tensor<1x!tf_type.resource>
+ // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() <{key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64}> : () -> tensor<1x!tf_type.resource>
// CHECK-NEXT: "tfl.hashtable_import"([[HASH_TABLE]], [[CST]], [[CST_1]]) : (tensor<1x!tf_type.resource>, tensor<3x!tf_type.string>, tensor<3xi64>) -> ()
// CHECK-NEXT: [[FIND:%.*]] = "tfl.hashtable_find"([[HASH_TABLE]], %arg0, [[CST_0]]) : (tensor<1x!tf_type.resource>, tensor<5x!tf_type.string>, tensor) -> tensor<*xi64>
// CHECK-NEXT: return [[FIND]] : tensor<*xi64>
@@ -102,7 +102,7 @@ func.func @hashtable_import_then_size(%arg0: tensor<5x!tf_type.string>) -> tenso
// CHECK-LABEL: hashtable_import_then_size
// CHECK: [[CST:%.*]] = arith.constant dense<["emerson", "lake", "palmer"]> : tensor<3x!tf_type.string>
// CHECK-NEXT: [[CST_0:%.*]] = arith.constant dense<[0, 1, 2]> : tensor<3xi64>
- // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() {key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64} : () -> tensor<1x!tf_type.resource>
+ // CHECK-NEXT: [[HASH_TABLE:%.*]] = "tfl.hashtable"() <{key_dtype = !tf_type.string, table_id = -1323619995 : i32, value_dtype = i64}> : () -> tensor<1x!tf_type.resource>
// CHECK-NEXT: "tfl.hashtable_import"([[HASH_TABLE]], [[CST]], [[CST_0]]) : (tensor<1x!tf_type.resource>, tensor<3x!tf_type.string>, tensor<3xi64>) -> ()
// CHECK-NEXT: [[SIZE:%.*]] = "tfl.hashtable_size"([[HASH_TABLE]]) : (tensor<1x!tf_type.resource>) -> tensor
// CHECK-NEXT: return [[SIZE]] : tensor
diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir
index 46ff509b7cc46e..2c17e734c58dad 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir
@@ -6,6 +6,6 @@ func.func @broadcast_to_bf16(%arg0: tensor<3xbf16>, %arg1: tensor<2xi64>) -> ten
// CHECK-LABEL: broadcast_to_bf16
// CHECK: [[CST:%.*]] = arith.constant dense<1.000000e+00> : tensor<3x3xbf16>
-// CHECK: [[MUL:%.*]] = tfl.mul(%arg0, [[CST]]) {fused_activation_function = "NONE"} : (tensor<3xbf16>, tensor<3x3xbf16>) -> tensor<3x3xbf16>
+// CHECK: [[MUL:%.*]] = tfl.mul(%arg0, [[CST]]) <{fused_activation_function = "NONE"}> : (tensor<3xbf16>, tensor<3x3xbf16>) -> tensor<3x3xbf16>
// CHECK: return [[MUL]] : tensor<3x3xbf16>
}
diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-variables.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-variables.mlir
index 54a10bf0dad5bc..36b26c78b258f8 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-variables.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-variables.mlir
@@ -15,8 +15,8 @@ module attributes {tf_saved_model.semantics} {
func.return %2, %3 : tensor<1x10xf32>, tensor<1x10xi64>
}
- // CHECK: %[[RESOURCE:.*]] = "tfl.var_handle"() {container = "c", shared_name = "a"} : () -> tensor>>
- // CHECK: %[[RESOURCE_1:.*]] = "tfl.var_handle"() {container = "c", shared_name = "b"} : () -> tensor>>
+ // CHECK: %[[RESOURCE:.*]] = "tfl.var_handle"() <{container = "c", shared_name = "a"}> : () -> tensor>>
+ // CHECK: %[[RESOURCE_1:.*]] = "tfl.var_handle"() <{container = "c", shared_name = "b"}> : () -> tensor>>
// CHECK: %[[VAR_VAL:.*]] = "tfl.read_variable"(%[[RESOURCE]]) : (tensor>>) -> tensor<1x10xf32>
// CHECK: %[[ADD:.*]] = tfl.add %[[VAR_VAL]], %arg0 {fused_activation_function = "NONE"} : tensor<1x10xf32>
// CHECK: "tfl.assign_variable"(%[[RESOURCE]], %[[ADD]]) : (tensor>>, tensor<1x10xf32>) -> ()
CHECK: "tfl.assign_variable"(%[[RESOURCE]], %[[ADD]]) : (tensor>>, tensor<1x10xf32>) -> () @@ -41,7 +41,7 @@ module attributes {tf_saved_model.semantics} { "tf.AssignVariableOp"(%handle_0, %cst_1) : (tensor>>, tensor<1x10xf32>) -> () func.return // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : tensor<1x10xf32> - // CHECK: %[[RESOURCE:.*]] = "tfl.var_handle"() {container = "c", shared_name = "a"} : () -> tensor>> + // CHECK: %[[RESOURCE:.*]] = "tfl.var_handle"() <{container = "c", shared_name = "a"}> : () -> tensor>> // CHECK: "tfl.assign_variable"(%[[RESOURCE]], %[[CST]]) : (tensor>>, tensor<1x10xf32>) -> () } @@ -57,7 +57,7 @@ module attributes {tf_saved_model.semantics} { "tf.AssignVariableOp"(%handle_0, %1) : (tensor>>, tensor<1x10xf32>) -> () %2 = "tf.ReadVariableOp"(%handle_0) {device = ""} : (tensor>>) -> tensor<1x10xf32> func.return %2 : tensor<1x10xf32> - // CHECK: %[[RESOURCE:.*]] = "tfl.var_handle"() {container = "c", shared_name = "a"} : () -> tensor>> + // CHECK: %[[RESOURCE:.*]] = "tfl.var_handle"() <{container = "c", shared_name = "a"}> : () -> tensor>> // CHECK: %[[VAR_VAL:.*]] = "tfl.read_variable"(%[[RESOURCE]]) : (tensor>>) -> tensor<1x10xf32> // CHECK: %[[ADD:.*]] = tfl.add %[[VAR_VAL]], %arg0 {fused_activation_function = "NONE"} : tensor<1x10xf32> // CHECK: "tfl.assign_variable"(%[[RESOURCE]], %[[ADD]]) : (tensor>>, tensor<1x10xf32>) -> () diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir index ab9b39bd94cb97..0dfd9e1c1a78bf 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir @@ -58,7 +58,7 @@ func.func @while_cond_10_frozen0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %ar // CANON-SAME: (tensor, tensor<256x256xf32>, tensor) // CANON: [[VAL_1:%.*]] = arith.constant dense<1.000000e+00> : tensor<256x256xf32> // CANON: [[VAL_2:%.*]] = arith.constant dense<0> : tensor -// CANON: [[VAL_6:%.*]]:3 = "tfl.while"([[VAL_2]], [[VAL_2]], [[VAL_0]]) ({ +// CANON: [[VAL_6:%.*]]:3 = "tfl.while"([[VAL_2]], [[VAL_2]], [[VAL_0]]) <{is_stateless = true}> ({ // CANON: ^bb0([[VAL_7:%.*]]: tensor<*xi32>, [[VAL_8:%.*]]: tensor<*xi32>, [[VAL_9:%.*]]: tensor<*xf32>): // CANON: [[VAL_3:%.*]] = arith.constant dense<10> : tensor // CANON: [[VAL_10:%.*]] = "tf.Less"([[VAL_8]], [[VAL_3]]) @@ -71,6 +71,6 @@ func.func @while_cond_10_frozen0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %ar // CANON: [[VAL_15:%.*]] = "tf.AddV2"([[VAL_13]], [[VAL_5]]) // CANON: [[VAL_16:%.*]] = "tf.AddV2"([[VAL_11]], [[VAL_4]]) // CANON: "tfl.yield"([[VAL_16]], [[VAL_14]], [[VAL_15]]) : (tensor<*xi32>, tensor<*xi32>, tensor<*xf32>) -> () -// CANON: }) {is_stateless = true} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) +// CANON: }) : (tensor, tensor, tensor) -> (tensor, tensor, tensor) // CANON: return [[VAL_17:%.*]]#1, [[VAL_1]], [[VAL_17]]#2 : tensor, tensor<256x256xf32>, tensor // CANON: } diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index a0b9f90a879507..2e178f754dbbc4 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -30,7 +30,7 @@ func.func @LeakyRelu(%arg0: tensor<1xf32>) -> tensor<1xf32> { func.return %2: tensor<1xf32> // CHECK-LABEL: LeakyRelu -// CHECK: "tfl.leaky_relu"(%arg0) {alpha = 1.000000e-01 : f32} : (tensor<1xf32>) -> tensor<1xf32> +// CHECK: "tfl.leaky_relu"(%arg0) 
<{alpha = 1.000000e-01 : f32}> : (tensor<1xf32>) -> tensor<1xf32> } func.func @biasAdd(%arg0: tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) -> tensor<1x10x10x32xf32> { @@ -38,7 +38,7 @@ func.func @biasAdd(%arg0: tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) -> tens func.return %0 : tensor<1x10x10x32xf32> // CHECK-LABEL: biasAdd -// CHECK: tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> +// CHECK: tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> } func.func @biasAddInt(%arg0: tensor<1x10x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x10x10x32xi32> { @@ -57,8 +57,8 @@ func.func @squeezeAndReshape(%arg0: tensor<1x1x10xf32>, %arg1: tensor) %4 = "tf.some_op"(%1, %3) : (tensor<*xf32>, tensor<2x5xf32>) -> i32 func.return %4 : i32 // CHECK-LABEL: squeezeAndReshape -// CHECK: "tfl.squeeze"(%arg0) {squeeze_dims = [0]} : (tensor<1x1x10xf32>) -> tensor<1x10xf32> -// CHECK: %1 = "tfl.squeeze"(%arg1) {squeeze_dims = []} : (tensor) -> tensor<*xf32> +// CHECK: "tfl.squeeze"(%arg0) <{squeeze_dims = [0]}> : (tensor<1x1x10xf32>) -> tensor<1x10xf32> +// CHECK: %1 = "tfl.squeeze"(%arg1) <{squeeze_dims = []}> : (tensor) -> tensor<*xf32> // CHECK: %cst = arith.constant dense<[2, 5]> : tensor<2xi32> // CHECK: %2 = "tfl.reshape"(%0, %cst) : (tensor<1x10xf32>, tensor<2xi32>) -> tensor<2x5xf32> // CHECK: %3 = "tf.some_op"(%1, %2) : (tensor<*xf32>, tensor<2x5xf32>) -> i32 @@ -118,7 +118,7 @@ func.func @avgPool2D(%arg0: tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> { func.return %6 : tensor<1x1x1x16xf32> // CHECK-LABEL: func @avgPool2D -// CHECK: "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> +// CHECK: "tfl.average_pool_2d"(%arg0) <{filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32}> : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> // CHECK: %1 = "tf.AvgPool"(%arg0) // CHECK: %2 = "tf.AvgPool"(%arg0) } @@ -138,7 +138,7 @@ func.func @avgPool2DChannelFirst(%arg0: tensor<1x16x6x6xf32>) -> tensor<1x16x1x1 // CHECK-LABEL: func @avgPool2DChannelFirst // CHECK: %cst = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> // CHECK: %0 = "tfl.transpose"(%arg0, %cst) : (tensor<1x16x6x6xf32>, tensor<4xi32>) -> tensor<1x6x6x16xf32> -// CHECK: %1 = "tfl.average_pool_2d"(%0) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> +// CHECK: %1 = "tfl.average_pool_2d"(%0) <{filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32}> : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> // CHECK: %cst_0 = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> // CHECK: %2 = "tfl.transpose"(%1, %cst_0) : (tensor<1x1x1x16xf32>, tensor<4xi32>) -> tensor<1x16x1x1xf32> // CHECK: %3 = "tf.AvgPool"(%arg0) @@ -150,7 +150,7 @@ func.func @softmax(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { func.return %0 : tensor<8x16xf32> // CHECK-LABEL: softmax -// CHECK: "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x16xf32>) -> tensor<8x16xf32> +// CHECK: "tfl.softmax"(%arg0) <{beta = 1.000000e+00 : f32}> : 
(tensor<8x16xf32>) -> tensor<8x16xf32> } func.func @softplus(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { @@ -160,7 +160,7 @@ func.func @softplus(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { // CHECK-LABEL: softplus // CHECK: %[[exp:.*]] = "tfl.exp"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> // CHECK: %[[cst:.*]] = arith.constant dense<1.000000e+00> : tensor -// CHECK: %[[add:.*]] = tfl.add(%[[exp]], %[[cst]]) {fused_activation_function = "NONE"} : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> +// CHECK: %[[add:.*]] = tfl.add(%[[exp]], %[[cst]]) <{fused_activation_function = "NONE"}> : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> // CHECK: %[[log:.*]] = "tfl.log"(%[[add]]) : (tensor<8x16xf32>) -> tensor<8x16xf32> } @@ -169,7 +169,7 @@ func.func @fakeQuantArgsFalse(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantArgsFalse - // CHECK: "tfl.quantize"(%arg0) {qtype = tensor<8x8x8x8x!quant.uniform>} + // CHECK: "tfl.quantize"(%arg0) <{qtype = tensor<8x8x8x8x!quant.uniform>}> // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<8x8x8x8x!quant.uniform>) -> tensor<8x8x8x8xf32> } @@ -178,7 +178,7 @@ func.func @fakeQuantArgsTrue(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantArgsTrue - // CHECK: "tfl.quantize"(%arg0) {qtype = tensor<8x8x8x8x!quant.uniform:f32, 0.001181102379804521:86>>} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8x!quant.uniform:f32, 0.001181102379804521:86>> + // CHECK: "tfl.quantize"(%arg0) <{qtype = tensor<8x8x8x8x!quant.uniform:f32, 0.001181102379804521:86>>}> : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8x!quant.uniform:f32, 0.001181102379804521:86>> // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<8x8x8x8x!quant.uniform:f32, 0.001181102379804521:86>>) -> tensor<8x8x8x8xf32> } @@ -189,7 +189,7 @@ func.func @fakeQuantVarsFalse(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantVarsFalse - // CHECK: "tfl.quantize"(%arg0) {qtype = tensor<8x8x8x8x!quant.uniform>} + // CHECK: "tfl.quantize"(%arg0) <{qtype = tensor<8x8x8x8x!quant.uniform>}> // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<8x8x8x8x!quant.uniform>) -> tensor<8x8x8x8xf32> } @@ -206,7 +206,7 @@ func.func @fakeQuantArgsFalse4Bits(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8 func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantArgsFalse - // CHECK: "tfl.quantize"(%arg0) {qtype = tensor<8x8x8x8x!quant.uniform>} + // CHECK: "tfl.quantize"(%arg0) <{qtype = tensor<8x8x8x8x!quant.uniform>}> // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<8x8x8x8x!quant.uniform>) -> tensor<8x8x8x8xf32> } @@ -215,7 +215,7 @@ func.func @fakeQuantArgsTrue4Bits(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8x func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantArgsTrue - // CHECK: "tfl.quantize"(%arg0) {qtype = tensor<8x8x8x8x!quant.uniform:f32, 0.021428571747882024:6>>} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8x!quant.uniform:f32, 0.021428571747882024:6>> + // CHECK: "tfl.quantize"(%arg0) <{qtype = tensor<8x8x8x8x!quant.uniform:f32, 0.021428571747882024:6>>}> : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8x!quant.uniform:f32, 0.021428571747882024:6>> // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<8x8x8x8x!quant.uniform:f32, 0.021428571747882024:6>>) -> tensor<8x8x8x8xf32> } @@ -226,7 +226,7 @@ func.func @fakeQuantVarsFalse4Bits(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8 func.return %0 : tensor<8x8x8x8xf32> // CHECK-LABEL: fakeQuantVarsFalse - // CHECK: 
"tfl.quantize"(%arg0) {qtype = tensor<8x8x8x8x!quant.uniform>} + // CHECK: "tfl.quantize"(%arg0) <{qtype = tensor<8x8x8x8x!quant.uniform>}> // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<8x8x8x8x!quant.uniform>) -> tensor<8x8x8x8xf32> } @@ -243,7 +243,7 @@ func.func @const() -> tensor<2xi32> { func.return %0: tensor<2xi32> // CHECK-LABEL: @const -// CHECK: "tfl.pseudo_const"() {value = #tf_type : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: "tfl.pseudo_const"() <{value = #tf_type : tensor<2xi32>}> : () -> tensor<2xi32> } func.func @shape(%arg0: tensor) -> tensor<2xi32> { @@ -357,7 +357,7 @@ func.func @maxPool2D(%arg0: tensor<1x1x1x16xf32>) -> tensor<1x1x1x16xf32> { func.return %6 : tensor<1x1x1x16xf32> // CHECK-LABEL: func @maxPool2D -// CHECK: "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x1x1x16xf32>) -> tensor<1x1x1x16xf32> +// CHECK: "tfl.max_pool_2d"(%arg0) <{filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32}> : (tensor<1x1x1x16xf32>) -> tensor<1x1x1x16xf32> // CHECK: %1 = "tf.MaxPool"(%arg0) // CHECK: %2 = "tf.MaxPool"(%arg0) } @@ -379,7 +379,7 @@ func.func @maxPool2DChannelFirst(%arg0: tensor<1x16x6x6xf32>) -> tensor<1x16x1x1 // CHECK-LABEL: func @maxPool2DChannelFirst // CHECK: %cst = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> // CHECK: %0 = "tfl.transpose"(%arg0, %cst) : (tensor<1x16x6x6xf32>, tensor<4xi32>) -> tensor<1x6x6x16xf32> -// CHECK: %1 = "tfl.max_pool_2d"(%0) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> +// CHECK: %1 = "tfl.max_pool_2d"(%0) <{filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32}> : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> // CHECK: %cst_0 = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> // CHECK: %2 = "tfl.transpose"(%1, %cst_0) : (tensor<1x1x1x16xf32>, tensor<4xi32>) -> tensor<1x16x1x1xf32> // CHECK: %3 = "tf.MaxPool"(%arg0) @@ -399,7 +399,7 @@ func.func @any(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { func.return %0 : tensor // CHECK-LABEL:any -// CHECK: "tfl.reduce_any"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor +// CHECK: "tfl.reduce_any"(%arg0, %arg1) <{keep_dims = false}> : (tensor<2x2xi1>, tensor) -> tensor } func.func @any_i64axes(%arg0: tensor<8x16x16xi1>, %arg1: tensor<2xi64>) -> tensor { @@ -408,7 +408,7 @@ func.func @any_i64axes(%arg0: tensor<8x16x16xi1>, %arg1: tensor<2xi64>) -> tenso // CHECK-LABEL: any_i64axes // CHECK: %[[V0:.*]] = "tfl.cast"(%arg1) : (tensor<2xi64>) -> tensor<2xi32> - // CHECK: "tfl.reduce_any"(%arg0, %[[V0]]) {keep_dims = false} : (tensor<8x16x16xi1>, tensor<2xi32>) -> tensor + // CHECK: "tfl.reduce_any"(%arg0, %[[V0]]) <{keep_dims = false}> : (tensor<8x16x16xi1>, tensor<2xi32>) -> tensor } func.func @ceil(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { @@ -449,7 +449,7 @@ func.func @squeezeDefault(%arg0: tensor<1x2x2xf32>) -> tensor<2x2xf32> { func.return %0 : tensor<2x2xf32> // CHECK-LABEL:squeezeDefault -// CHECK: "tfl.squeeze"(%arg0) {squeeze_dims = []} : (tensor<1x2x2xf32>) -> tensor<2x2xf32> +// CHECK: "tfl.squeeze"(%arg0) <{squeeze_dims = []}> : (tensor<1x2x2xf32>) -> tensor<2x2xf32> } func.func 
@squeezeSingleAxis(%arg0: tensor<2x1x2xf32>) -> tensor<2x2xf32> { @@ -457,7 +457,7 @@ func.func @squeezeSingleAxis(%arg0: tensor<2x1x2xf32>) -> tensor<2x2xf32> { func.return %0 : tensor<2x2xf32> // CHECK-LABEL:squeezeSingleAxis -// CHECK: "tfl.squeeze"(%arg0) {squeeze_dims = [1]} : (tensor<2x1x2xf32>) -> tensor<2x2xf32> +// CHECK: "tfl.squeeze"(%arg0) <{squeeze_dims = [1]}> : (tensor<2x1x2xf32>) -> tensor<2x2xf32> } func.func @squeezeTwoAxes(%arg0: tensor<1x2x1x2xf32>) -> tensor<2x2xf32> { @@ -465,7 +465,7 @@ func.func @squeezeTwoAxes(%arg0: tensor<1x2x1x2xf32>) -> tensor<2x2xf32> { func.return %0 : tensor<2x2xf32> // CHECK-LABEL:squeezeTwoAxes -// CHECK: "tfl.squeeze"(%arg0) {squeeze_dims = [0, 2]} : (tensor<1x2x1x2xf32>) -> tensor<2x2xf32> +// CHECK: "tfl.squeeze"(%arg0) <{squeeze_dims = [0, 2]}> : (tensor<1x2x1x2xf32>) -> tensor<2x2xf32> } func.func @gatherScalarIndices(%arg0 : tensor<3x2xf32>, %arg1 : tensor) -> tensor<2xf32> { @@ -473,7 +473,7 @@ func.func @gatherScalarIndices(%arg0 : tensor<3x2xf32>, %arg1 : tensor) -> func.return %0 : tensor<2xf32> // CHECK-LABEL:gatherScalarIndices -// CHECK: "tfl.gather"(%arg0, %arg1) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<3x2xf32>, tensor) -> tensor<2xf32> +// CHECK: "tfl.gather"(%arg0, %arg1) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<3x2xf32>, tensor) -> tensor<2xf32> } func.func @gatherVectorIndices(%arg0 : tensor<2xf32>, %arg1 : tensor<3xi32>) -> tensor<3xf32> { @@ -481,7 +481,7 @@ func.func @gatherVectorIndices(%arg0 : tensor<2xf32>, %arg1 : tensor<3xi32>) -> func.return %0 : tensor<3xf32> // CHECK-LABEL:gatherVectorIndices -// CHECK: "tfl.gather"(%arg0, %arg1) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<2xf32>, tensor<3xi32>) -> tensor<3xf32> +// CHECK: "tfl.gather"(%arg0, %arg1) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<2xf32>, tensor<3xi32>) -> tensor<3xf32> } func.func @gatherHigherRankIndices(%arg0 : tensor<2x3x6xf32>, %arg1 : tensor<4x5xi32>) -> tensor<4x5x3x6xf32> { @@ -489,7 +489,7 @@ func.func @gatherHigherRankIndices(%arg0 : tensor<2x3x6xf32>, %arg1 : tensor<4x5 func.return %0 : tensor<4x5x3x6xf32> // CHECK-LABEL:gatherHigherRankIndices -// CHECK: "tfl.gather"(%arg0, %arg1) {axis = 0 : i32, batch_dims = 0 : i32} : (tensor<2x3x6xf32>, tensor<4x5xi32>) -> tensor<4x5x3x6xf32> +// CHECK: "tfl.gather"(%arg0, %arg1) <{axis = 0 : i32, batch_dims = 0 : i32}> : (tensor<2x3x6xf32>, tensor<4x5xi32>) -> tensor<4x5x3x6xf32> } func.func @gatherNdVectorIndices(%arg0 : tensor<3x2x2xf32>, %arg1 : tensor<2xi32>) -> tensor<2xf32> { @@ -544,7 +544,7 @@ func.func @gatherV2VectorIndices(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5x func.return %1 : tensor<1x3x5x20xf32> // CHECK-LABEL:gatherV2VectorIndices -// CHECK: "tfl.gather"(%arg0, %arg1) {axis = 1 : i32, batch_dims = 0 : i32} : (tensor<1x2x20xf32>, tensor<3x5xi32>) -> tensor<1x3x5x20xf32> +// CHECK: "tfl.gather"(%arg0, %arg1) <{axis = 1 : i32, batch_dims = 0 : i32}> : (tensor<1x2x20xf32>, tensor<3x5xi32>) -> tensor<1x3x5x20xf32> } func.func @gatherV2VectorIndices_I64Axis(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5xi32>) -> tensor<1x3x5x20xf32> { @@ -553,7 +553,7 @@ func.func @gatherV2VectorIndices_I64Axis(%arg0 : tensor<1x2x20xf32>, %arg1 : ten func.return %1 : tensor<1x3x5x20xf32> // CHECK-LABEL:gatherV2VectorIndices_I64Axis -// CHECK: "tfl.gather"(%arg0, %arg1) {axis = 1 : i32, batch_dims = 0 : i32} : (tensor<1x2x20xf32>, tensor<3x5xi32>) -> tensor<1x3x5x20xf32> +// CHECK: "tfl.gather"(%arg0, %arg1) <{axis = 1 : i32, batch_dims = 0 : i32}> : 
(tensor<1x2x20xf32>, tensor<3x5xi32>) -> tensor<1x3x5x20xf32> } func.func @gatherV2VectorIndicesNegAxis(%arg0 : tensor<1x2x20xf32>, %arg1 : tensor<3x5xi32>) -> tensor<1x2x3x5xf32> { @@ -562,7 +562,7 @@ func.func @gatherV2VectorIndicesNegAxis(%arg0 : tensor<1x2x20xf32>, %arg1 : tens func.return %1 : tensor<1x2x3x5xf32> // CHECK-LABEL:gatherV2VectorIndices -// CHECK: "tfl.gather"(%arg0, %arg1) {axis = -1 : i32, batch_dims = 0 : i32} : (tensor<1x2x20xf32>, tensor<3x5xi32>) -> tensor<1x2x3x5xf32> +// CHECK: "tfl.gather"(%arg0, %arg1) <{axis = -1 : i32, batch_dims = 0 : i32}> : (tensor<1x2x20xf32>, tensor<3x5xi32>) -> tensor<1x2x3x5xf32> } func.func @gatherWithBatchDims(%arg0 : tensor<2x3x6xf32>, %arg1 : tensor<2x5xi32>) -> tensor<2x5x3x6xf32> { @@ -571,7 +571,7 @@ func.func @gatherWithBatchDims(%arg0 : tensor<2x3x6xf32>, %arg1 : tensor<2x5xi32 func.return %1 : tensor<2x5x3x6xf32> // CHECK-LABEL:gatherWithBatchDims -// CHECK: "tfl.gather"(%arg0, %arg1) {axis = 1 : i32, batch_dims = 1 : i32} : (tensor<2x3x6xf32>, tensor<2x5xi32>) -> tensor<2x5x3x6xf32> +// CHECK: "tfl.gather"(%arg0, %arg1) <{axis = 1 : i32, batch_dims = 1 : i32}> : (tensor<2x3x6xf32>, tensor<2x5xi32>) -> tensor<2x5x3x6xf32> } @@ -1037,7 +1037,7 @@ func.func @pack2Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x func.return %0 : tensor<2x2xi32> // CHECK-LABEL: pack2Tensors -// CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> +// CHECK: "tfl.pack"(%arg0, %arg1) <{axis = 0 : i32, values_count = 2 : i32}> : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> } func.func @pack3Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2 : tensor<2xi32>) -> tensor<2x3xi32> { @@ -1045,7 +1045,7 @@ func.func @pack3Tensors(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2 : tens func.return %0 : tensor<2x3xi32> // CHECK-LABEL: pack3Tensors -// CHECK: "tfl.pack"(%arg0, %arg1, %arg2) {axis = 1 : i32, values_count = 3 : i32} : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x3xi32> +// CHECK: "tfl.pack"(%arg0, %arg1, %arg2) <{axis = 1 : i32, values_count = 3 : i32}> : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x3xi32> } func.func @packStringWithFlex(%arg0: tensor<2x!tf_type.string>, %arg1: tensor<2x!tf_type.string>) -> tensor<2x2x!tf_type.string> { @@ -1061,7 +1061,7 @@ func.func @packNegAxis(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2 : tenso func.return %0 : tensor<2x3xi32> // CHECK-LABEL: packNegAxis -// CHECK: "tfl.pack"(%arg0, %arg1, %arg2) {axis = -1 : i32, values_count = 3 : i32} : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x3xi32> +// CHECK: "tfl.pack"(%arg0, %arg1, %arg2) <{axis = -1 : i32, values_count = 3 : i32}> : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x3xi32> } func.func @unpack2Tensors(%arg0: tensor<2x2xi32>) -> tensor<2xi32> { @@ -1069,7 +1069,7 @@ func.func @unpack2Tensors(%arg0: tensor<2x2xi32>) -> tensor<2xi32> { func.return %0#0 : tensor<2xi32> // CHECK-LABEL: unpack2Tensors -// CHECK: "tfl.unpack"(%arg0) {axis = 0 : i32, num = 2 : i32} : (tensor<2x2xi32>) -> (tensor<2xi32>, tensor<2xi32>) +// CHECK: "tfl.unpack"(%arg0) <{axis = 0 : i32, num = 2 : i32}> : (tensor<2x2xi32>) -> (tensor<2xi32>, tensor<2xi32>) } func.func @unpack3Tensors(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { @@ -1077,7 +1077,7 @@ func.func @unpack3Tensors(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { func.return %0#0 : tensor<2xi32> // CHECK-LABEL: unpack3Tensors -// CHECK: "tfl.unpack"(%arg0) {axis = 1 : i32, 
-// CHECK: "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>)
+// CHECK: "tfl.unpack"(%arg0) <{axis = 1 : i32, num = 3 : i32}> : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>)
}
func.func @unpackNegAxis(%arg0: tensor<2x3xi32>) -> tensor<2xi32> {
@@ -1085,7 +1085,7 @@ func.func @unpackNegAxis(%arg0: tensor<2x3xi32>) -> tensor<2xi32> {
func.return %0#0 : tensor<2xi32>
// CHECK-LABEL: unpackNegAxis
-// CHECK: "tfl.unpack"(%arg0) {axis = -1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>)
+// CHECK: "tfl.unpack"(%arg0) <{axis = -1 : i32, num = 3 : i32}> : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>)
}
func.func @mean(%arg0: tensor<2x2xf32>, %arg1: tensor<1xi32>) -> tensor<1x2xf32> {
@@ -1093,7 +1093,7 @@ func.func @mean(%arg0: tensor<2x2xf32>, %arg1: tensor<1xi32>) -> tensor<1x2xf32>
func.return %0 : tensor<1x2xf32>
// CHECK-LABEL: mean
-// CHECK: "tfl.mean"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xf32>, tensor<1xi32>) -> tensor<1x2xf32>
+// CHECK: "tfl.mean"(%arg0, %arg1) <{keep_dims = false}> : (tensor<2x2xf32>, tensor<1xi32>) -> tensor<1x2xf32>
}
func.func @mean_true(%arg0: tensor<2x2xf32>, %arg1: tensor<1xi32>) -> tensor<1x2xf32> {
@@ -1101,7 +1101,7 @@ func.func @mean_true(%arg0: tensor<2x2xf32>, %arg1: tensor<1xi32>) -> tensor<1x2
func.return %0 : tensor<1x2xf32>
// CHECK-LABEL: mean_true
-// CHECK: "tfl.mean"(%arg0, %arg1) {keep_dims = true} : (tensor<2x2xf32>, tensor<1xi32>) -> tensor<1x2xf32>
+// CHECK: "tfl.mean"(%arg0, %arg1) <{keep_dims = true}> : (tensor<2x2xf32>, tensor<1xi32>) -> tensor<1x2xf32>
}
func.func @sum(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1109,7 +1109,7 @@ func.func @sum(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor
// CHECK-LABEL: sum
- // CHECK: "tfl.sum"(%arg0, %arg1) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.sum"(%arg0, %arg1) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @sum_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1117,7 +1117,7 @@ func.func @sum_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor<
func.return %0 : tensor
// CHECK-LABEL: sum_true
- // CHECK: "tfl.sum"(%arg0, %arg1) {keep_dims = true} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.sum"(%arg0, %arg1) <{keep_dims = true}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @sum_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>) -> tensor {
@@ -1126,7 +1126,7 @@ func.func @sum_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>) -> tens
// CHECK-LABEL: sum_i64axes
// CHECK: %[[V0:.*]] = "tfl.cast"(%arg1) : (tensor<2xi64>) -> tensor<2xi32>
- // CHECK: "tfl.sum"(%arg0, %[[V0]]) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.sum"(%arg0, %[[V0]]) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_min(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1134,7 +1134,7 @@ func.func @reduce_min(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tenso
func.return %0 : tensor
// CHECK-LABEL: reduce_min
- // CHECK: "tfl.reduce_min"(%arg0, %arg1) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_min"(%arg0, %arg1) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_min_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1142,7 +1142,7 @@ func.func @reduce_min_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) ->
func.return %0 : tensor
// CHECK-LABEL: reduce_min_true
- // CHECK: "tfl.reduce_min"(%arg0, %arg1) {keep_dims = true} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_min"(%arg0, %arg1) <{keep_dims = true}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_min_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>) -> tensor {
@@ -1151,7 +1151,7 @@ func.func @reduce_min_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>)
// CHECK-LABEL: reduce_min_i64axes
// CHECK: %[[V0:.*]] = "tfl.cast"(%arg1) : (tensor<2xi64>) -> tensor<2xi32>
- // CHECK: "tfl.reduce_min"(%arg0, %[[V0]]) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_min"(%arg0, %[[V0]]) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_max(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1159,7 +1159,7 @@ func.func @reduce_max(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tenso
func.return %0 : tensor
// CHECK-LABEL: reduce_max
- // CHECK: "tfl.reduce_max"(%arg0, %arg1) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_max"(%arg0, %arg1) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_max_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1167,7 +1167,7 @@ func.func @reduce_max_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) ->
func.return %0 : tensor
// CHECK-LABEL: reduce_max_true
- // CHECK: "tfl.reduce_max"(%arg0, %arg1) {keep_dims = true} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_max"(%arg0, %arg1) <{keep_dims = true}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_max_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>) -> tensor {
@@ -1176,7 +1176,7 @@ func.func @reduce_max_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>)
// CHECK-LABEL: reduce_max_i64axes
// CHECK: %[[V0:.*]] = "tfl.cast"(%arg1) : (tensor<2xi64>) -> tensor<2xi32>
- // CHECK: "tfl.reduce_max"(%arg0, %[[V0]]) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_max"(%arg0, %[[V0]]) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_prod(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1184,7 +1184,7 @@ func.func @reduce_prod(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tens
func.return %0 : tensor
// CHECK-LABEL: reduce_prod
- // CHECK: "tfl.reduce_prod"(%arg0, %arg1) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_prod"(%arg0, %arg1) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_prod_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) -> tensor {
@@ -1192,7 +1192,7 @@ func.func @reduce_prod_true(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi32>) ->
func.return %0 : tensor
// CHECK-LABEL: reduce_prod_true
- // CHECK: "tfl.reduce_prod"(%arg0, %arg1) {keep_dims = true} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_prod"(%arg0, %arg1) <{keep_dims = true}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @reduce_prod_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>) -> tensor {
@@ -1201,7 +1201,7 @@ func.func @reduce_prod_i64axes(%arg0: tensor<8x16x16xf32>, %arg1: tensor<2xi64>)
// CHECK-LABEL: reduce_prod_i64axes
// CHECK: %[[V0:.*]] = "tfl.cast"(%arg1) : (tensor<2xi64>) -> tensor<2xi32>
- // CHECK: "tfl.reduce_prod"(%arg0, %[[V0]]) {keep_dims = false} : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
+ // CHECK: "tfl.reduce_prod"(%arg0, %[[V0]]) <{keep_dims = false}> : (tensor<8x16x16xf32>, tensor<2xi32>) -> tensor
}
func.func @batch_to_space_nd(%arg0: tensor<4x2x2x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor<2x2xi32>) -> tensor {
@@ -1248,7 +1248,7 @@ func.func @split(%arg0: tensor, %arg1: tensor<1x4x3x3xf32>) -> tensor<1x4x3
func.return %0#0 : tensor<1x4x3xf32>
// CHECK-LABEL: split
- // CHECK: "tfl.split"(%arg0, %arg1) {num_splits = 3 : i32} : (tensor, tensor<1x4x3x3xf32>) -> (tensor<1x4x3xf32>, tensor<1x4x3xf32>, tensor<1x4x3xf32>)
+ // CHECK: "tfl.split"(%arg0, %arg1) <{num_splits = 3 : i32}> : (tensor, tensor<1x4x3x3xf32>) -> (tensor<1x4x3xf32>, tensor<1x4x3xf32>, tensor<1x4x3xf32>)
}
func.func @splitv(%arg0: tensor<1x4x3x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor) -> tensor<1x4x2x3xf32> {
@@ -1256,7 +1256,7 @@ func.func @splitv(%arg0: tensor<1x4x3x3xf32>, %arg1: tensor<2xi32>, %arg2: tenso
func.return %0#0 : tensor<1x4x2x3xf32>
// CHECK-LABEL: splitv
- // CHECK: "tfl.split_v"(%arg0, %arg1, %arg2) {num_splits = 2 : i32} : (tensor<1x4x3x3xf32>, tensor<2xi32>, tensor) -> (tensor<1x4x2x3xf32>, tensor<1x4x1x3xf32>)
+ // CHECK: "tfl.split_v"(%arg0, %arg1, %arg2) <{num_splits = 2 : i32}> : (tensor<1x4x3x3xf32>, tensor<2xi32>, tensor) -> (tensor<1x4x2x3xf32>, tensor<1x4x1x3xf32>)
}
func.func @matmul(%arg0: tensor<40x37xf32>, %arg1: tensor<37x40xf32>) -> tensor<40x40xf32> {
@@ -1266,8 +1266,8 @@ func.func @matmul(%arg0: tensor<40x37xf32>, %arg1: tensor<37x40xf32>) -> tensor<
// CHECK-LABEL: matmul
// CHECK: %[[CST:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32>
// CHECK: %[[ARG:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32>
-// CHECK: %[[CST_0:.*]] = "tfl.no_value"() {value} : () -> none
-// CHECK: "tfl.fully_connected"(%arg0, %[[ARG]], %[[CST_0]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
+// CHECK: %[[CST_0:.*]] = "tfl.no_value"() <{value}> : () -> none
+// CHECK: "tfl.fully_connected"(%arg0, %[[ARG]], %[[CST_0]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
}
func.func @matmul_transposed_a(%arg0: tensor<37x40xf32>, %arg1: tensor<37x40xf32>) -> tensor<40x40xf32> {
@@ -1278,8 +1278,8 @@ func.func @matmul_transposed_a(%arg0: tensor<37x40xf32>, %arg1: tensor<37x40xf32
// CHECK: %[[CST_0:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32>
// CHECK: %[[ARG_0:.*]] = "tfl.transpose"(%arg0, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32>
// CHECK: %[[ARG_1:.*]] = "tfl.transpose"(%arg1, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32>
-// CHECK: %[[CST_2:.*]] = "tfl.no_value"() {value} : () -> none
-// CHECK: "tfl.fully_connected"(%[[ARG_0]], %[[ARG_1]], %[[CST_2]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
+// CHECK: %[[CST_2:.*]] = "tfl.no_value"() <{value}> : () -> none
+// CHECK: "tfl.fully_connected"(%[[ARG_0]], %[[ARG_1]], %[[CST_2]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
}
func.func @matmul_transposed_b(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> {
@@ -1287,8 +1287,8 @@ func.func @matmul_transposed_b(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32
(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32>
func.return %0 : tensor<40x40xf32>
// CHECK-LABEL: matmul_transposed_b
-// CHECK: %[[CST:.*]] = "tfl.no_value"() {value} : () -> none
-// CHECK: "tfl.fully_connected"(%arg0, %arg1, %[[CST]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
+// CHECK: %[[CST:.*]] = "tfl.no_value"() <{value}> : () -> none
+// CHECK: "tfl.fully_connected"(%arg0, %arg1, %[[CST]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
}
func.func @matmul_transposed_ab(%arg0: tensor<37x40xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> {
@@ -1298,8 +1298,8 @@ func.func @matmul_transposed_ab(%arg0: tensor<37x40xf32>, %arg1: tensor<40x37xf3
// CHECK-LABEL: matmul_transposed_ab
// CHECK: %[[CST_0:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32>
// CHECK: %[[ARG_0:.*]] = "tfl.transpose"(%arg0, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32>
-// CHECK: %[[CST_1:.*]] = "tfl.no_value"() {value} : () -> none
-// CHECK: "tfl.fully_connected"(%[[ARG_0]], %arg1, %[[CST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
+// CHECK: %[[CST_1:.*]] = "tfl.no_value"() <{value}> : () -> none
+// CHECK: "tfl.fully_connected"(%[[ARG_0]], %arg1, %[[CST_1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32>
}
func.func @concat_v2_with_3_tensors(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> {
@@ -1308,7 +1308,7 @@ func.func @concat_v2_with_3_tensors(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi3
func.return %1 : tensor<2x3xi32>
// CHECK-LABEL: concat_v2_with_3_tensors
-// CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32>
+// CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32>
}
func.func @concat_v2_i64_axis(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> {
@@ -1317,7 +1317,7 @@ func.func @concat_v2_i64_axis(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %a
func.return %1 : tensor<2x3xi32>
// CHECK-LABEL: concat_v2_i64_axis
-// CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32>
+// CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32>
}
func.func @concat_v2_with_bool_type(%arg0: tensor, %arg1: tensor) -> tensor {
@@ -1326,28 +1326,28 @@ func.func @concat_v2_with_bool_type(%arg0: tensor, %arg1: tensor
func.return %1 : tensor
// CHECK-LABEL: concat_v2_with_bool_type
-// CHECK: "tfl.concatenation"(%arg0, %arg1) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor, tensor) -> tensor
+// CHECK: "tfl.concatenation"(%arg0, %arg1) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor, tensor) -> tensor
}
func.func @resize_with_bilinear(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor {
%0 = "tf.ResizeBilinear"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
func.return %0 : tensor
// CHECK-LABEL: resize_with_bilinear
- // CHECK: "tfl.resize_bilinear"(%arg0, %arg1) {align_corners = true, half_pixel_centers = false} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
+ // CHECK: "tfl.resize_bilinear"(%arg0, %arg1) <{align_corners = true, half_pixel_centers = false}> : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
}
func.func @resize_with_bilinear_with_half_pixel_centers(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor {
%0 = "tf.ResizeBilinear"(%arg0, %arg1) {align_corners = false, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
func.return %0 : tensor
// CHECK-LABEL: resize_with_bilinear_with_half_pixel_centers
- // CHECK: "tfl.resize_bilinear"(%arg0, %arg1) {align_corners = false, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
+ // CHECK: "tfl.resize_bilinear"(%arg0, %arg1) <{align_corners = false, half_pixel_centers = true}> : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
}
func.func @strided_slice(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> {
%0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
func.return %0 : tensor<1x2x2x5xf32>
// CHECK-LABEL: strided_slice
- // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
+ // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
}
func.func @strided_slice_with_constant_attributes(%arg0: tensor<10x10x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<10x10xf32> {
@@ -1360,14 +1360,14 @@ func.func @strided_slice_with_constant_attributes(%arg0: tensor<10x10x10xf32>, %
// CHECK-DAG: [[BEGIN:%cst.*]] = arith.constant dense<-1> : tensor<1xi32>
// CHECK-DAG: [[END:%cst.*]] = arith.constant dense<0> : tensor<1xi32>
// CHECK-DAG: [[STRIDES:%cst.*]] = arith.constant dense<1> : tensor<1xi32>
- // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32} : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32>
+ // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32}> : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32>
}
func.func @strided_slice_with_string(%arg0: tensor<12x2x2x5x!tf_type.string>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> {
%0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string>
func.return %0 : tensor<1x2x2x5x!tf_type.string>
// CHECK-LABEL: strided_slice_with_string
- // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string>
+ // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string>
}
func.func @strided_slice_with_unranked_input_and_i64_parameters(%arg0: tensor<*xf32>, %arg1: tensor<1xi64>, %arg2: tensor<1xi64>, %arg3: tensor<1xi64>) -> tensor<*xf32> {
@@ -1377,7 +1377,7 @@ func.func @strided_slice_with_unranked_input_and_i64_parameters(%arg0: tensor<*x
// CHECK-DAG: [[BEGIN:%.*]] = "tfl.cast"(%arg1) : (tensor<1xi64>) -> tensor<1xi32>
// CHECK-DAG: [[END:%.*]] = "tfl.cast"(%arg2) : (tensor<1xi64>) -> tensor<1xi32>
// CHECK-DAG: [[STRIDES:%.*]] = "tfl.cast"(%arg3) : (tensor<1xi64>) -> tensor<1xi32>
- // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<*xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<*xf32>
+ // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<*xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<*xf32>
}
func.func @strided_slice_with_i64_parameters(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi64>, %arg2: tensor<1xi64>, %arg3: tensor<1xi64>) -> tensor<1x2x2x5xf32> {
@@ -1387,7 +1387,7 @@ func.func @strided_slice_with_i64_parameters(%arg0: tensor<12x2x2x5xf32>, %arg1:
// CHECK-DAG: [[BEGIN:%.*]] = "tfl.cast"(%arg1) : (tensor<1xi64>) -> tensor<1xi32>
// CHECK-DAG: [[END:%.*]] = "tfl.cast"(%arg2) : (tensor<1xi64>) -> tensor<1xi32>
// CHECK-DAG: [[STRIDES:%.*]] = "tfl.cast"(%arg3) : (tensor<1xi64>) -> tensor<1xi32>
- // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
 + // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
}
func.func @strided_slice_with_i64_constant_attributes(%arg0: tensor<10x10x10xf32>) -> tensor<10x10xf32> {
@@ -1400,21 +1400,21 @@ func.func @strided_slice_with_i64_constant_attributes(%arg0: tensor<10x10x10xf32
// CHECK-DAG: [[BEGIN:%cst.*]] = arith.constant dense<-1> : tensor<1xi32>
// CHECK-DAG: [[END:%cst.*]] = arith.constant dense<0> : tensor<1xi32>
// CHECK-DAG: [[STRIDES:%cst.*]] = arith.constant dense<1> : tensor<1xi32>
- // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32} : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32>
+ // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32}> : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32>
}
func.func @strided_slice_non_zero_ellipsis_mask(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> {
%0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 1 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
func.return %0 : tensor<1x2x2x5xf32>
// CHECK-LABEL: strided_slice_non_zero_ellipsis_mask
- // CHECK: %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
+ // CHECK: %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) <{begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
}
func.func @strided_slice_non_zero_new_axis_mask(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> {
%0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 2 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
func.return %0 : tensor<1x2x2x5xf32>
// CHECK-LABEL: strided_slice_non_zero_new_axis_mask
- // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 2 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
+ // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 2 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32>
}
func.func @strided_slice_big_dims(%arg0: tensor<5x6x7xf32>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>, %arg3: tensor<3xi32>) -> tensor<1x1x5x6x7xf32> {
@@ -1437,7 +1437,7 @@ func.func @mirror_pad(tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor {
func.return %0#0 : tensor
// CHECK-LABEL: mirror_pad
- // CHECK: "tfl.mirror_pad"(%arg0, %arg1) {mode = #tfl} : (tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor
+ // CHECK: "tfl.mirror_pad"(%arg0, %arg1) <{mode = #tfl}> : (tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor
// CHECK: return
}
@@ -1447,7 +1447,7 @@ func.func @mirror_pad_reflect(tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor
// CHECK-LABEL: mirror_pad_reflect
- // CHECK: "tfl.mirror_pad"(%arg0, %arg1) {mode = #tfl} : (tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor
+ // CHECK: "tfl.mirror_pad"(%arg0, %arg1) <{mode = #tfl}> : (tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor
// CHECK: return
}
@@ -1512,7 +1512,7 @@ func.func @ReverseSequence(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>) -> tens
func.return %0: tensor<2x3xf32>
// CHECK-LABEL: ReverseSequence
-// CHECK: "tfl.reverse_sequence"(%arg0, %arg1) {batch_dim = 0 : i32, seq_dim = 0 : i32} : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32>
+// CHECK: "tfl.reverse_sequence"(%arg0, %arg1) <{batch_dim = 0 : i32, seq_dim = 0 : i32}> : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32>
}
func.func @LRN(%arg0: tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32> {
@@ -1520,7 +1520,7 @@ func.func @LRN(%arg0: tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32> {
func.return %0: tensor<2x3x4x5xf32>
// CHECK-LABEL: LRN
- // CHECK: "tfl.local_response_normalization"(%arg0) {alpha = 1.000000e+00 : f32, beta = 5.000000e-01 : f32, bias = 1.000000e+00 : f32, radius = 5 : i32} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
+ // CHECK: "tfl.local_response_normalization"(%arg0) <{alpha = 1.000000e+00 : f32, beta = 5.000000e-01 : f32, bias = 1.000000e+00 : f32, radius = 5 : i32}> : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
// CHECK: return %0 : tensor<2x3x4x5xf32>
}
@@ -1529,7 +1529,7 @@ func.func @OneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor,
func.return %0: tensor<*xf32>
// CHECK-LABEL: OneHot
-// CHECK: "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32>
+// CHECK: "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) <{axis = -1 : i32}> : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32>
}
func.func @argmax(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor {
@@ -1554,7 +1554,7 @@ func.func @space_to_depth(%arg0: tensor<1x2x2x1xf32>) -> tensor {
// CHECK-LABEL: space_to_depth
// CHECK: %[[ARG:.*]]: tensor<1x2x2x1xf32>
- // CHECK: "tfl.space_to_depth"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x2x2x1xf32>) -> tensor
+ // CHECK: "tfl.space_to_depth"(%[[ARG]]) <{block_size = 2 : i32}> : (tensor<1x2x2x1xf32>) -> tensor
}
func.func @round(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> {
@@ -1571,14 +1571,14 @@ func.func @resize_nearest_neighbor(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor
%0 = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
func.return %0 : tensor
// CHECK-LABEL: resize_nearest_neighbor
- // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) {align_corners = true, half_pixel_centers = false} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
+ // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) <{align_corners = true, half_pixel_centers = false}> : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
}
func.func @resize_nearest_neighbor_with_half_pixel_centers(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor {
%0 = "tf.ResizeNearestNeighbor"(%arg0, %arg1) {align_corners = false, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
func.return %0 : tensor
// CHECK-LABEL: resize_nearest_neighbor_with_half_pixel_centers
- // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) {align_corners = false, half_pixel_centers = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
+ // CHECK: "tfl.resize_nearest_neighbor"(%arg0, %arg1) <{align_corners = false, half_pixel_centers = true}> : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor
}
func.func @sparse_to_dense_with_scalar_sparse_indices(%arg0: tensor, %arg1: tensor<3xi32>, %arg2: tensor, %arg3: tensor) -> tensor {
@@ -1643,7 +1643,7 @@ func.func @depth_to_space(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> {
// CHECK-LABEL: depth_to_space
// CHECK: %[[ARG:.*]]: tensor<1x1x1x4xf32>
- // CHECK: "tfl.depth_to_space"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32>
+ // CHECK: "tfl.depth_to_space"(%[[ARG]]) <{block_size = 2 : i32}> : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32>
}
func.func @non_max_suppression_v4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor<2xi32> {
@@ -1687,9 +1687,9 @@ func.func @conv2d_backprop_input(%arg0: tensor<4xi32>, %arg1: tensor<3x3x1x32xf3
// CHECK-LABEL: conv2d_backprop_input
// CHECK: %[[CST:.*]] = arith.constant dense<[2, 0, 1, 3]> : tensor<4xi32>
// CHECK: %[[ARG0:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32>
- // CHECK: %[[CST_0:.*]] = "tfl.no_value"() {value} : () -> none
- // CHECK: %[[ARG1:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) {fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32>
- // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) {fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32>
+ // CHECK: %[[CST_0:.*]] = "tfl.no_value"() <{value}> : () -> none
+ // CHECK: %[[ARG1:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) <{fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32>
+ // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) <{fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32>
// CHECK: %[[RESULT:.*]] = tfl.add %[[ARG1]], %[[ARG3]] {fused_activation_function = "NONE"} : tensor<15x28x28x1xf32>
// CHECK: return %[[RESULT]] : tensor<15x28x28x1xf32>
}
@@ -1734,7 +1734,7 @@ func.func @reciprocal_f32(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK-LABEL: reciprocal_f32
// CHECK: %cst = arith.constant dense<1.000000e+00> : tensor
-// CHECK: tfl.div(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor, tensor<8xf32>) -> tensor<8xf32>
+// CHECK: tfl.div(%cst, %arg0) <{fused_activation_function = "NONE"}> : (tensor, tensor<8xf32>) -> tensor<8xf32>
// CHECK: return
}
@@ -1744,7 +1744,7 @@ func.func @reciprocal_i32(%arg0: tensor<8xi32>) -> tensor<8xi32> {
// CHECK-LABEL: reciprocal_i32
// CHECK: %cst = arith.constant dense<1> : tensor
-// CHECK: tfl.div(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor, tensor<8xi32>) -> tensor<8xi32>
+// CHECK: tfl.div(%cst, %arg0) <{fused_activation_function = "NONE"}> : (tensor, tensor<8xi32>) -> tensor<8xi32>
// CHECK: return
}
@@ -1763,8 +1763,8 @@ func.func @LstmWithoutProjection(%arg: tensor<28x1x28xf32>) -> (tensor<28x1x16xf
// CHECK-DAG: [[VAL_2:%.*]] = arith.constant dense<0.000000e+00> : tensor<16x16xf32>
// CHECK-DAG: [[VAL_3:%.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32>
// CHECK-DAG: [[VAL_4:%.*]] = arith.constant dense<0.000000e+00> : tensor<1x16xf32>
-// CHECK-DAG: [[VAL_5:%.*]] = "tfl.no_value"() {value} : () -> none
-// CHECK: [[VAL_6:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_5]], [[VAL_5]], [[VAL_4]], [[VAL_4]], [[VAL_5]], [[VAL_5]], [[VAL_5]], [[VAL_5]]) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, none, none, tensor<1x16xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x16xf32>
+// CHECK-DAG: [[VAL_5:%.*]] = "tfl.no_value"() <{value}> : () -> none
+// CHECK: [[VAL_6:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_5]], [[VAL_5]], [[VAL_4]], [[VAL_4]], [[VAL_5]], [[VAL_5]], [[VAL_5]], [[VAL_5]]) <{cell_clip = 0.000000e+00 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor<28x1x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, none, none, tensor<1x16xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x16xf32>
// CHECK: return [[VAL_6]] : tensor<28x1x16xf32>
// CHECK: }
@@ -1788,8 +1788,8 @@ func.func @LstmWithProjection(%arg: tensor<28x1x16xf32>) -> (tensor<28x1x8xf32>)
// CHECK-DAG: [[VAL_11:%.*]] = arith.constant dense<0.000000e+00> : tensor<1x16xf32>
// CHECK-DAG: [[VAL_12:%.*]] = arith.constant dense<0.000000e+00> : tensor<8x16xf32>
// CHECK-DAG: [[VAL_13:%.*]] = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
-// CHECK-DAG: [[VAL_14:%.*]] = "tfl.no_value"() {value} : () -> none
-// CHECK: [[VAL_15:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_12]], [[VAL_14]], [[VAL_13]], [[VAL_11]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_14]]) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, none, none, none, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<8x16xf32>, none, tensor<1x8xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x8xf32>
+// CHECK-DAG: [[VAL_14:%.*]] = "tfl.no_value"() <{value}> : () -> none
+// CHECK: [[VAL_15:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_12]], [[VAL_14]], [[VAL_13]], [[VAL_11]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_14]]) <{cell_clip = 0.000000e+00 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor<28x1x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, none, none, none, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<8x16xf32>, none, tensor<1x8xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x8xf32>
// CHECK: return [[VAL_15]] : tensor<28x1x8xf32>
// CHECK: }
@@ -1805,7 +1805,7 @@ func.func @UnidirectionalRnn(%arg: tensor<28x1x28xf32>) -> (tensor<28x1x28xf32>)
// CHECK-DAG: [[VAL_1:%.*]] = arith.constant dense<0.000000e+00> : tensor<28x28xf32>
// CHECK-DAG: [[VAL_2:%.*]] = arith.constant dense<0.000000e+00> : tensor<28xf32>
// CHECK-DAG: [[VAL_3:%.*]] = arith.constant dense<0.000000e+00> : tensor<1x28xf32>
-// CHECK: [[VAL_4:%.*]] = "tfl.unidirectional_sequence_rnn"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_3]]) {fused_activation_function = "TANH", time_major = true} : (tensor<28x1x28xf32>, tensor<28x28xf32>, tensor<28x28xf32>, tensor<28xf32>, tensor<1x28xf32>) -> tensor<28x1x28xf32>
+// CHECK: [[VAL_4:%.*]] = "tfl.unidirectional_sequence_rnn"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_3]]) <{fused_activation_function = "TANH", time_major = true}> : (tensor<28x1x28xf32>, tensor<28x28xf32>, tensor<28x28xf32>, tensor<28xf32>, tensor<1x28xf32>) -> tensor<28x1x28xf32>
// CHECK: return [[VAL_4]] : tensor<28x1x28xf32>
// CHECK: }
@@ -1832,7 +1832,7 @@ func.func @matmul_batch(%arg0: tensor<10x15xf32>, %arg1: tensor<15x17xf32>) -> t
(tensor<10x15xf32>, tensor<15x17xf32>) -> tensor<10x17xf32>
func.return %0 : tensor<10x17xf32>
// CHECK-LABEL: matmul_batch
-// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<10x15xf32>, tensor<15x17xf32>) -> tensor<10x17xf32>
+// CHECK: "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<10x15xf32>, tensor<15x17xf32>) -> tensor<10x17xf32>
}
func.func @matmul_batchv2(%arg0: tensor<2x10x15xf32>, %arg1: tensor<15x17xf32>) -> tensor<2x10x17xf32> {
@@ -1840,7 +1840,7 @@ func.func @matmul_batchv2(%arg0: tensor<2x10x15xf32>, %arg1: tensor<15x17xf32>)
(tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32>
func.return %0 : tensor<2x10x17xf32>
// CHECK-LABEL: matmul_batchv2
-// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32>
+// CHECK: "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32>
}
func.func @matmul_batchv3(%arg0: tensor<2x10x15xf32>, %arg1: tensor<15x17xf32>) -> tensor<2x10x17xf32> {
@@ -1848,7 +1848,7 @@ func.func @matmul_batchv3(%arg0: tensor<2x10x15xf32>, %arg1: tensor<15x17xf32>)
(tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32>
func.return %0 : tensor<2x10x17xf32>
// CHECK-LABEL: matmul_batchv3
-// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32>
+// CHECK: "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<2x10x15xf32>, tensor<15x17xf32>) -> tensor<2x10x17xf32>
}
func.func @matmul_batchv3_int8(%arg0: tensor<2x10x15xi8>, %arg1: tensor<15x17xi8>) -> tensor<2x10x17xi32> {
@@ -1856,7 +1856,7 @@ func.func @matmul_batchv3_int8(%arg0: tensor<2x10x15xi8>, %arg1: tensor<15x17xi8
(tensor<2x10x15xi8>, tensor<15x17xi8>) -> tensor<2x10x17xi32>
func.return %0 : tensor<2x10x17xi32>
// CHECK-LABEL: matmul_batchv3_int8
-// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x10x15xi8>, tensor<15x17xi8>) -> tensor<2x10x17xi32>
+// CHECK: "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<2x10x15xi8>, tensor<15x17xi8>) -> tensor<2x10x17xi32>
}
func.func @matmul_batchv2_unknown_dim(%arg0: tensor, %arg1: tensor<15x17xf32>) -> tensor {
@@ -1864,7 +1864,7 @@ func.func @matmul_batchv2_unknown_dim(%arg0: tensor, %arg1: tensor<
(tensor, tensor<15x17xf32>) -> tensor
func.return %0 : tensor
// CHECK-LABEL: matmul_batchv2_unknown_dim
-// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor, tensor<15x17xf32>) -> tensor
+// CHECK: "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor, tensor<15x17xf32>) -> tensor
}
func.func @matmul_batchv3_unknown_dim(%arg0: tensor, %arg1: tensor<15x17xf32>) -> tensor {
@@ -1872,7 +1872,7 @@ func.func @matmul_batchv3_unknown_dim(%arg0: tensor, %arg1: tensor<
(tensor, tensor<15x17xf32>) -> tensor
func.return %0 : tensor
// CHECK-LABEL: matmul_batchv3_unknown_dim
-// CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor, tensor<15x17xf32>) -> tensor
+// CHECK: "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false}> : (tensor, tensor<15x17xf32>) -> tensor
}
func.func @matmul_batchv3_unknown_dim_bf16(%arg0: tensor, %arg1: tensor<5x6xf32>) -> tensor {
@@ -1883,7 +1883,7 @@ func.func @matmul_batchv3_unknown_dim_bf16(%arg0: tensor, %arg1: ten
func.return %2 : tensor
// CHECK-LABEL: matmul_batchv3_unknown_dim_bf16
// CHECK: [[CST:%.*]] = "tfl.cast"(%arg0) : (tensor) -> tensor
-// CHECK: [[BMM:%.*]] = "tfl.batch_matmul"([[CST]], %arg1) {adj_x = false, adj_y = false} : (tensor, tensor<5x6xf32>) -> tensor
+// CHECK: [[BMM:%.*]] = "tfl.batch_matmul"([[CST]], %arg1) <{adj_x = false, adj_y = false}> : (tensor, tensor<5x6xf32>) -> tensor
// CHECK: "tfl.cast"([[BMM]]) : (tensor) -> tensor
}
@@ -1918,12 +1918,12 @@ func.func @test5DAddWithImplicitBroadcast(%arg0: tensor<1x1x1x3x1xi32>, %arg1 :
%0 = "tf.Add"(%arg0, %arg1): (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32>
func.return %0 : tensor<1x1x1x3x4xi32>
// CHECK-LABEL: test5DAddWithImplicitBroadcast
-// CHECK: %0 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32>
+// CHECK: %0 = tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32>
}
func.func @test6DAddWithImplicitBroadcast(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> {
// CHECK-LABEL: test6DAddWithImplicitBroadcast
-// CHECK: %0 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32>
+// CHECK: %0 = 
tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> %0 = "tf.Add"(%arg0, %arg1) : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> func.return %0 : tensor<1x2x3x4x5x6xi32> } @@ -1941,12 +1941,12 @@ func.func @test5DSubWithImplicitBroadcast(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : %0 = "tf.Sub"(%arg0, %arg1): (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> func.return %0 : tensor<1x1x1x3x4xi32> // CHECK-LABEL: test5DSubWithImplicitBroadcast -// CHECK: %0 = tfl.sub(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> +// CHECK: %0 = tfl.sub(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> } func.func @test6DSubWithImplicitBroadcast(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> { // CHECK-LABEL: test6DSubWithImplicitBroadcast -// CHECK: %0 = tfl.sub(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> +// CHECK: %0 = tfl.sub(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> %0 = "tf.Sub"(%arg0, %arg1) : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> func.return %0 : tensor<1x2x3x4x5x6xi32> } @@ -1964,12 +1964,12 @@ func.func @test5DMulWithImplicitBroadcast(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : %0 = "tf.Mul"(%arg0, %arg1): (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> func.return %0 : tensor<1x1x1x3x4xi32> // CHECK-LABEL: test5DMulWithImplicitBroadcast -// CHECK: %0 = tfl.mul(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> +// CHECK: %0 = tfl.mul(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x1x1x3x1xi32>, tensor<1x1x1x1x4xi32>) -> tensor<1x1x1x3x4xi32> } func.func @test6DMulWithImplicitBroadcast(%arg0: tensor<1x2x1x4x5x6xi32>, %arg1: tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> { // CHECK-LABEL: test6DMulWithImplicitBroadcast -// CHECK: %0 = tfl.mul(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> +// CHECK: %0 = tfl.mul(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1x2x1x4x5x6xi32>, tensor<1x2x3x4x5x1xi32>) -> tensor<1x2x3x4x5x6xi32> func.return %0 : tensor<1x2x3x4x5x6xi32> } @@ -2148,7 +2148,7 @@ func.func @cumsum(%arg0: tensor<3x3xf32>, %arg1: tensor) -> tensor<3x3xf32> %0 = "tf.Cumsum"(%arg0, %arg1) {exclusive = false, reverse = false} : (tensor<3x3xf32>, tensor) -> tensor<3x3xf32> func.return %0 : tensor<3x3xf32> // CHECK-LABEL: cumsum - // CHECK: "tfl.cumsum"(%arg0, %arg1) {exclusive = false, reverse = false} : (tensor<3x3xf32>, tensor) -> tensor<3x3xf32> + // CHECK: "tfl.cumsum"(%arg0, %arg1) <{exclusive = false, reverse = false}> : (tensor<3x3xf32>, tensor) -> tensor<3x3xf32> } func.func @cumsum_i64(%arg0: tensor<3x3xf32>, %arg1: tensor) -> tensor<3x3xf32> { @@ -2314,8 +2314,8 @@ func.func @conv3d_valid(%arg0: tensor,%arg1: tensor // CHECK-LABEL: conv3d_valid - // CHECK: %[[CST:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: 
[[BCT:%.*]] = "tfl.conv_3d"(%arg0, %arg1, %[[CST]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor, tensor, none) -> tensor + // CHECK: %[[CST:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: [[BCT:%.*]] = "tfl.conv_3d"(%arg0, %arg1, %[[CST]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor, tensor, none) -> tensor // CHECK: return [[BCT]] : tensor } @@ -2359,7 +2359,7 @@ func.func @all(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { func.return %0 : tensor // CHECK-LABEL:all -// CHECK: "tfl.reduce_all"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor +// CHECK: "tfl.reduce_all"(%arg0, %arg1) <{keep_dims = false}> : (tensor<2x2xi1>, tensor) -> tensor } func.func @all_i64axes(%arg0: tensor<8x16x16xi1>, %arg1: tensor<2xi64>) -> tensor { @@ -2368,7 +2368,7 @@ func.func @all_i64axes(%arg0: tensor<8x16x16xi1>, %arg1: tensor<2xi64>) -> tenso // CHECK-LABEL: all_i64axes // CHECK: %[[V0:.*]] = "tfl.cast"(%arg1) : (tensor<2xi64>) -> tensor<2xi32> - // CHECK: "tfl.reduce_all"(%arg0, %[[V0]]) {keep_dims = false} : (tensor<8x16x16xi1>, tensor<2xi32>) -> tensor + // CHECK: "tfl.reduce_all"(%arg0, %[[V0]]) <{keep_dims = false}> : (tensor<8x16x16xi1>, tensor<2xi32>) -> tensor } func.func @quantize_dequantize_v4(%arg0 : tensor) -> tensor { @@ -2378,7 +2378,7 @@ func.func @quantize_dequantize_v4(%arg0 : tensor) -> tensor { func.return %0 : tensor // CHECK-LABEL: quantize_dequantize_v4 -// CHECK: %[[QUANT:.*]] = "tfl.quantize"(%arg0) {qtype = tensor>} : (tensor) -> tensor> +// CHECK: %[[QUANT:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor>}> : (tensor) -> tensor> // CHECK: %[[DEQUANT:.*]] = "tfl.dequantize"(%[[QUANT]]) : (tensor>) -> tensor // CHECK: return %[[DEQUANT]] } @@ -2387,9 +2387,9 @@ func.func @conv3d_transpose(%arg0: tensor<2x5x6x8x2xf32>, %arg1: tensor<1x2x2x3x %0 = "tf.Conv3DBackpropInputV2"(%arg2, %arg1, %arg0) {data_format = "NDHWC", dilations = [1, 1, 1, 1, 1], padding = "VALID", strides = [1, 2, 2, 2, 1]} : (tensor<5xi64>, tensor<1x2x2x3x2xf32>, tensor<2x5x6x8x2xf32>) -> tensor func.return %0 : tensor // CHECK-LABEL: conv3d_transpose - // CHECK: %[[CST:.*]] = "tfl.no_value"() {value} : () -> none + // CHECK: %[[CST:.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK: %[[OUT_SHAPE:.*]] = "tfl.cast"(%arg2) : (tensor<5xi64>) -> tensor<5xi32> - // CHECK: %[[RESULT:.*]] = "tfl.conv_3d_transpose"(%[[OUT_SHAPE]], %arg1, %arg0, %[[CST]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 2 : i32, stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<5xi32>, tensor<1x2x2x3x2xf32>, tensor<2x5x6x8x2xf32>, none) -> tensor + // CHECK: %[[RESULT:.*]] = "tfl.conv_3d_transpose"(%[[OUT_SHAPE]], %arg1, %arg0, %[[CST]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 2 : i32, stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<5xi32>, tensor<1x2x2x3x2xf32>, tensor<2x5x6x8x2xf32>, none) -> tensor // CHECK: return %[[RESULT]] : tensor } @@ -2464,7 +2464,7 @@ func.func @mul_with_unranked_lhs(%arg0: tensor<*xf32>, %arg1: tensor // 
CHECK-LABEL:mul_with_unranked_lhs - // CHECK: %0 = tfl.mul(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<*xf32>, tensor) -> tensor + // CHECK: %0 = tfl.mul(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<*xf32>, tensor) -> tensor // CHECK: return %0 : tensor } @@ -2530,7 +2530,7 @@ func.func @Bucketize(%arg0: tensor<3x2xf32>) -> tensor<3x2xi32> { func.return %0: tensor<3x2xi32> // CHECK-LABEL: Bucketize -// CHECK: "tfl.bucketize"(%arg0) {boundaries = [1.000000e+00 : f32, 1.000000e+01 : f32, 1.000000e+02 : f32]} : (tensor<3x2xf32>) -> tensor<3x2xi32> +// CHECK: "tfl.bucketize"(%arg0) <{boundaries = [1.000000e+00 : f32, 1.000000e+01 : f32, 1.000000e+02 : f32]}> : (tensor<3x2xf32>) -> tensor<3x2xi32> } func.func @random_uniform_f32(%arg0: tensor<3xi32>) -> tensor { @@ -2538,7 +2538,7 @@ func.func @random_uniform_f32(%arg0: tensor<3xi32>) -> tensor { func.return %0 : tensor // CHECK-LABEL:random_uniform_f32 -// CHECK: "tfl.random_uniform"(%arg0) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<3xi32>) -> tensor +// CHECK: "tfl.random_uniform"(%arg0) <{seed = 0 : i64, seed2 = 0 : i64}> : (tensor<3xi32>) -> tensor } func.func @random_standard_normal_f32(%arg0: tensor<3xi32>) -> tensor { @@ -2546,7 +2546,7 @@ func.func @random_standard_normal_f32(%arg0: tensor<3xi32>) -> tensor func.return %0 : tensor // CHECK-LABEL:random_standard_normal_f32 -// CHECK: "tfl.random_standard_normal"(%arg0) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<3xi32>) -> tensor +// CHECK: "tfl.random_standard_normal"(%arg0) <{seed = 0 : i64, seed2 = 0 : i64}> : (tensor<3xi32>) -> tensor } func.func @multinomial_i64(%arg0: tensor<2xf32>, %arg1: tensor<1xi32>) -> tensor<10xi64> { @@ -2554,7 +2554,7 @@ func.func @multinomial_i64(%arg0: tensor<2xf32>, %arg1: tensor<1xi32>) -> tensor func.return %0 : tensor<10xi64> // CHECK-LABEL:multinomial_i64 -// CHECK: "tfl.multinomial"(%arg0, %arg1) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<2xf32>, tensor<1xi32>) -> tensor<10xi64> +// CHECK: "tfl.multinomial"(%arg0, %arg1) <{seed = 0 : i64, seed2 = 0 : i64}> : (tensor<2xf32>, tensor<1xi32>) -> tensor<10xi64> } func.func @multinomial_i32(%arg0: tensor<2xf32>, %arg1: tensor<1xi32>) -> tensor<10xi32> { @@ -2562,7 +2562,7 @@ func.func @multinomial_i32(%arg0: tensor<2xf32>, %arg1: tensor<1xi32>) -> tensor func.return %0 : tensor<10xi32> // CHECK-LABEL:multinomial_i32 -// CHECK: "tfl.multinomial"(%arg0, %arg1) {seed = 0 : i64, seed2 = 0 : i64} : (tensor<2xf32>, tensor<1xi32>) -> tensor<10xi32> +// CHECK: "tfl.multinomial"(%arg0, %arg1) <{seed = 0 : i64, seed2 = 0 : i64}> : (tensor<2xf32>, tensor<1xi32>) -> tensor<10xi32> } func.func @dynamic_update_slice(%arg0: tensor<4x5xi32>, %arg1: tensor<1x5xi32>, %arg2: tensor<2xi32>) -> tensor<4x5xi32> { @@ -2683,7 +2683,7 @@ func.func @sigmoidGrad(%arg0: tensor, %arg1: tensor) -> tens func.return %0 : tensor // CHECK-LABEL: sigmoidGrad // CHECK-NEXT: [[ONE:%.+]] = arith.constant dense<1.000000e+00> : tensor -// CHECK-NEXT: [[SUB:%.+]] = tfl.sub([[ONE]], %arg0) {fused_activation_function = "NONE"} : (tensor, tensor) -> tensor +// CHECK-NEXT: [[SUB:%.+]] = tfl.sub([[ONE]], %arg0) <{fused_activation_function = "NONE"}> : (tensor, tensor) -> tensor // CHECK-NEXT: [[MUL0:%.+]] = tfl.mul %arg0, [[SUB]] {fused_activation_function = "NONE"} : tensor // CHECK-NEXT: [[MUL1:%.+]] = tfl.mul %arg1, [[MUL0]] {fused_activation_function = "NONE"} : tensor // CHECK: return [[MUL1]] @@ -2697,8 +2697,8 @@ func.func @batchmatmul2fullyconnected(%arg0: tensor<4x128x2xf32>) -> (tensor<4x1 // 
CHECK-LABEL: batchmatmul2fullyconnected // CHECK-DAG: %cst_0 = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: %0 = "tfl.transpose"(%cst, %cst_0) : (tensor<2x1xf32>, tensor<2xi32>) -> tensor<1x2xf32> - // CHECK-DAG: %1 = "tfl.no_value"() {value} : () -> none - // CHECK: %2 = "tfl.fully_connected"(%arg0, %0, %1) {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> + // CHECK-DAG: %1 = "tfl.no_value"() <{value}> : () -> none + // CHECK: %2 = "tfl.fully_connected"(%arg0, %0, %1) <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> // CHECK: return %2 : tensor<4x128x1xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir b/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir index 76f453d1d3a8aa..fe1c86c3d3feb7 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir @@ -4,7 +4,7 @@ // CHECK-LABEL: func @tfl_wrapped_jax_random_normal( // CHECK-SAME: %[[RNG:.*]]: tensor<2xui32>) -> tuple> { // CHECK: %[[VAL_0:.*]] = stablehlo.constant dense<[3, 4]> : tensor<2xi32> -// CHECK: %[[VAL_1:.*]] = "tfl.custom"(%[[VAL_0]]) {custom_code = "RandomStandardNormal", custom_option = #tfl} : (tensor<2xi32>) -> tensor<3x4xf32> +// CHECK: %[[VAL_1:.*]] = "tfl.custom"(%[[VAL_0]]) <{custom_code = "RandomStandardNormal", custom_option = #tfl}> : (tensor<2xi32>) -> tensor<3x4xf32> // CHECK: %[[VAL_2:.*]] = stablehlo.tuple %[[VAL_1]] : tuple> // CHECK: return %[[VAL_2]] : tuple> // CHECK: } @@ -20,7 +20,7 @@ func.func @tfl_wrapped_jax_random_normal(%arg0: tensor<2xui32>) -> tuple) -> tuple> { // CHECK: %[[VAL_0:.*]] = stablehlo.constant dense<[1, 2]> : tensor<2xi32> -// CHECK: %[[VAL_1:.*]] = "tfl.custom"(%[[VAL_0]]) {custom_code = "RandomUniform", custom_option = #tfl} : (tensor<2xi32>) -> tensor<1x2xf32> +// CHECK: %[[VAL_1:.*]] = "tfl.custom"(%[[VAL_0]]) <{custom_code = "RandomUniform", custom_option = #tfl}> : (tensor<2xi32>) -> tensor<1x2xf32> // CHECK: %[[VAL_2:.*]] = stablehlo.tuple %[[VAL_1]] : tuple> // CHECK: return %[[VAL_2]] : tuple> // CHECK: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir index 89b3a2b7caa079..10f051bf66c966 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir @@ -59,7 +59,7 @@ func.func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } -// IMPORT: "tfl.fake_quant"(%arg0) {max = 1.400000e+00 : f32, min = 3.000000e-01 : f32, narrow_range = false, num_bits = 6 : i32} +// IMPORT: "tfl.fake_quant"(%arg0) <{max = 1.400000e+00 : f32, min = 3.000000e-01 : f32, narrow_range = false, num_bits = 6 : i32}> %0 = "tfl.fake_quant"(%arg0) {num_bits = 6 : i32, narrow_range = false, min = 0.3:f32, max = 1.4:f32} : (tensor<4 x f32>) -> tensor<4 x f32> func.return %0 : tensor<4xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/modify_io_nodes.mlir b/tensorflow/compiler/mlir/lite/tests/modify_io_nodes.mlir index 686fcc5703552e..38d2a05904e418 100644 --- a/tensorflow/compiler/mlir/lite/tests/modify_io_nodes.mlir +++ b/tensorflow/compiler/mlir/lite/tests/modify_io_nodes.mlir @@ -9,39 +9,39 @@ func.func @modified(%arg0: 
tensor<1x224x224x3xf32>) -> tensor<1x401408xf32> attr %2 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> %3 = "tfl.conv_2d"(%0, %1, %2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> %4 = "tfl.reshape"(%3, %cst) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> - %5 = "tfl.softmax"(%4) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> + %5 = "tfl.softmax"(%4) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> %6 = "tfl.dequantize"(%5) : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408xf32> func.return %6 : tensor<1x401408xf32> // CHECK-LABEL: func @modified(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x401408xf32> // CHECK-NEXT: %[[shape:.*]] = arith.constant dense<[1, 401408]> : tensor<2xi32> -// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> -// CHECK-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> -// CHECK-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> -// CHECK-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> +// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x224x224x3x!quant.uniform>}> : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> +// CHECK-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> +// CHECK-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> : () -> tensor<32x!quant.uniform> +// CHECK-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> // CHECK-NEXT: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> -// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> 
tensor<1x401408x!quant.uniform> // CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[softmax]]) : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408xf32> // CHECK-NEXT: return %[[dq]] : tensor<1x401408xf32> // INT8-LABEL: @modified(%arg0: tensor<1x224x224x3x!quant.uniform>) -> tensor<1x401408x!quant.uniform> // INT8-NEXT: %[[shape:.*]] = arith.constant dense<[1, 401408]> : tensor<2xi32> -// INT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> -// INT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> -// INT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[cst1]], %[[cst2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> +// INT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> +// INT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> : () -> tensor<32x!quant.uniform> +// INT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[cst1]], %[[cst2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> // INT8-NEXT: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> -// INT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// INT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> // INT8-NEXT: return %[[softmax]] : tensor<1x401408x!quant.uniform> // UINT8-LABEL: func @modified(%arg0: tensor<1x224x224x3x!quant.uniform>) -> tensor<1x401408x!quant.uniform> // UINT8-NEXT: %[[shape:.*]] = arith.constant dense<[1, 401408]> : tensor<2xi32> -// UINT8-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3x!quant.uniform> -// UINT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> -// UINT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> -// UINT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, 
tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> +// UINT8-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x224x224x3x!quant.uniform>}> : (tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3x!quant.uniform> +// UINT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> +// UINT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> : () -> tensor<32x!quant.uniform> +// UINT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> // UINT8-NEXT: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> -// UINT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> -// UINT8-NEXT: %[[dq:.*]] = "tfl.quantize"(%[[softmax]]) {qtype = tensor<1x401408x!quant.uniform>} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// UINT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// UINT8-NEXT: %[[dq:.*]] = "tfl.quantize"(%[[softmax]]) <{qtype = tensor<1x401408x!quant.uniform>}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> // UINT8-NEXT: return %[[dq]] : tensor<1x401408x!quant.uniform> } @@ -52,40 +52,40 @@ func.func @not_modified(%arg0: tensor, %arg1: tensor<1x224x224x3xf32>) -> ( %2 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> %3 = "tfl.conv_2d"(%0, %1, %2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> %4 = "tfl.reshape"(%3, %cst) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> - %5 = "tfl.softmax"(%4) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> + %5 = "tfl.softmax"(%4) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> %6 = "tfl.dequantize"(%5) : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408xf32> func.return %6, %arg1 : tensor<1x401408xf32>, tensor<1x224x224x3xf32> // CHECK-LABEL: func @not_modified(%arg0: tensor, %arg1: tensor<1x224x224x3xf32>) -> (tensor<1x401408xf32>, tensor<1x224x224x3xf32>) // CHECK-NEXT: %[[shape:.*]] = arith.constant dense<[1, 401408]> : tensor<2xi32> -// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> -// CHECK-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = 
tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> -// CHECK-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> -// CHECK-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> +// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x224x224x3x!quant.uniform>}> : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> +// CHECK-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> +// CHECK-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> : () -> tensor<32x!quant.uniform> +// CHECK-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> // CHECK-NEXT: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> -// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> // CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[softmax]]) : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408xf32> // CHECK-NEXT: return %[[dq]], %arg1 : tensor<1x401408xf32>, tensor<1x224x224x3xf32> // INT8-LABEL: @not_modified(%arg0: tensor, %arg1: tensor<1x224x224x3xf32>) -> (tensor<1x401408x!quant.uniform>, tensor<1x224x224x3xf32>) // INT8-NEXT: %[[shape:.*]] = arith.constant dense<[1, 401408]> : tensor<2xi32> -// INT8-NEXT: %[[q:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> -// INT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> -// INT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> -// INT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> +// INT8-NEXT: %[[q:.*]] = 
"tfl.quantize"(%arg1) <{qtype = tensor<1x224x224x3x!quant.uniform>}> : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> +// INT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> +// INT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> : () -> tensor<32x!quant.uniform> +// INT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> // INT8-NEXT: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> -// INT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// INT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> // INT8-NEXT: return %[[softmax]], %arg1 : tensor<1x401408x!quant.uniform>, tensor<1x224x224x3xf32> // UINT8-LABEL: func @not_modified(%arg0: tensor, %arg1: tensor<1x224x224x3xf32>) -> (tensor<1x401408x!quant.uniform>, tensor<1x224x224x3xf32>) // UINT8-NEXT: %[[shape:.*]] = arith.constant dense<[1, 401408]> : tensor<2xi32> -// UINT8-NEXT: %[[q:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x224x224x3x!quant.uniform>} : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> -// UINT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> -// UINT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> -// UINT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> +// UINT8-NEXT: %[[q:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x224x224x3x!quant.uniform>}> : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3x!quant.uniform> +// UINT8-NEXT: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>> +// UINT8-NEXT: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> : () -> tensor<32x!quant.uniform> +// UINT8-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%[[q]], %[[cst1]], %[[cst2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 
0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> // UINT8-NEXT: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[shape]]) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> -// UINT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> -// UINT8-NEXT: %[[dq:.*]] = "tfl.quantize"(%[[softmax]]) {qtype = tensor<1x401408x!quant.uniform>} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// UINT8-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> +// UINT8-NEXT: %[[dq:.*]] = "tfl.quantize"(%[[softmax]]) <{qtype = tensor<1x401408x!quant.uniform>}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> // UINT8-NEXT: return %[[dq]], %arg1 : tensor<1x401408x!quant.uniform>, tensor<1x224x224x3xf32> } @@ -96,7 +96,7 @@ func.func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x401408xf32> { %2 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> %3 = "tfl.conv_2d"(%0, %1, %2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> %4 = "tfl.reshape"(%3, %cst) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> - %5 = "tfl.softmax"(%4) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> + %5 = "tfl.softmax"(%4) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> %6 = "tfl.dequantize"(%5) : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408xf32> func.return %6 : tensor<1x401408xf32> @@ -112,7 +112,7 @@ func.func @non_entry_funciton(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x401408 %2 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} : () -> tensor<32x!quant.uniform> %3 = "tfl.conv_2d"(%0, %1, %2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3x!quant.uniform>, tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216>>, tensor<32x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> %4 = "tfl.reshape"(%3, %cst) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -> tensor<1x401408x!quant.uniform> - %5 = "tfl.softmax"(%4) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> + %5 = "tfl.softmax"(%4) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408x!quant.uniform> %6 = "tfl.dequantize"(%5) : (tensor<1x401408x!quant.uniform>) -> tensor<1x401408xf32> func.return %6 : tensor<1x401408xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 20c4a031157a89..fa69cd46017f8f 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -401,7 +401,7 @@ func.func @testAddWithI64Broadcasting(tensor< 2x3xi64>, tensor<3xi64>) -> tensor // CHECK-LABEL: 
add_with_i32_five_dim_broadcasting func.func @add_with_i32_five_dim_broadcasting(tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> { ^bb0(%arg0: tensor<1x1x1x1x1xi32>, %arg1: tensor<1xi32>): - // CHECK: tfl.add(%arg0, %arg1) {fused_activation_function = "RELU6"} + // CHECK: tfl.add(%arg0, %arg1) <{fused_activation_function = "RELU6"}> %0 = "tfl.add"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> func.return %0#0 : tensor<1x1x1x1x1xi32> } @@ -420,7 +420,7 @@ func.func @add_with_quantized_i16_broadcasting(tensor<2x2xf32>, tensor<1xf32>) - // CHECK-LABEL: sub_with_i32_five_dim_broadcasting func.func @sub_with_i32_five_dim_broadcasting(tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> { ^bb0(%arg0: tensor<1x1x1x1x1xi32>, %arg1: tensor<1xi32>): - // CHECK: tfl.sub(%arg0, %arg1) {fused_activation_function = "RELU6"} + // CHECK: tfl.sub(%arg0, %arg1) <{fused_activation_function = "RELU6"}> %0 = "tfl.sub"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> func.return %0#0 : tensor<1x1x1x1x1xi32> } @@ -438,7 +438,7 @@ func.func @sub_with_quantized_i8_five_dim_broadcasting(tensor<1x1x1x1x1xf32>, te // CHECK-LABEL: mul_with_i32_five_dim_broadcasting func.func @mul_with_i32_five_dim_broadcasting(tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> { ^bb0(%arg0: tensor<1x1x1x1x1xi32>, %arg1: tensor<1xi32>): - // CHECK: tfl.mul(%arg0, %arg1) {fused_activation_function = "RELU6"} + // CHECK: tfl.mul(%arg0, %arg1) <{fused_activation_function = "RELU6"}> %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1x1x1x1x1xi32>, tensor<1xi32>) -> tensor<1x1x1x1x1xi32> func.return %0#0 : tensor<1x1x1x1x1xi32> } @@ -448,7 +448,7 @@ func.func @mul_with_i32_five_dim_broadcasting(tensor<1x1x1x1x1xi32>, tensor<1xi3 // CHECK-LABEL: mul_with_quantized_i16_five_dim_broadcasting func.func @mul_with_quantized_i16_five_dim_broadcasting(tensor<1x1x1x1x1x!quant.any>, tensor<1x!quant.any>) -> tensor<1x1x1x1x1x!quant.any> { ^bb0(%arg0: tensor<1x1x1x1x1x!quant.any>, %arg1: tensor<1x!quant.any>): - // CHECK: tfl.mul(%arg0, %arg1) {fused_activation_function = "RELU6"} + // CHECK: tfl.mul(%arg0, %arg1) <{fused_activation_function = "RELU6"}> %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1x1x1x1x1x!quant.any>, tensor<1x!quant.any>) -> tensor<1x1x1x1x1x!quant.any> func.return %0#0 : tensor<1x1x1x1x1x!quant.any> } @@ -467,7 +467,7 @@ func.func @mul_with_quantized_i16_to_uint8_broadcasting(tensor<1x1x!quant.any, tensor) -> tensor> { ^bb0(%arg0: tensor, %arg1: tensor): - // CHECK: tfl.mul(%arg0, %arg1) {fused_activation_function = "RELU6"} + // CHECK: tfl.mul(%arg0, %arg1) <{fused_activation_function = "RELU6"}> %0 = "tfl.mul"(%arg0, %arg1) {fused_activation_function = "RELU6"}: (tensor, tensor) -> tensor> func.return %0#0 : tensor> } @@ -614,7 +614,7 @@ func.func @testConv2D4DBias(tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tenso // CHECK-LABEL: testFakeQuant func.func @testFakeQuant(tensor, f32, f32) -> tensor { ^bb0(%arg0: tensor, %arg1: f32, %arg2: f32): - // CHECK: "tfl.fake_quant"(%arg0) {max = 1.400000e+00 : f32, min = 3.000000e-01 : f32, narrow_range = false, num_bits = 6 : i32} : (tensor) -> tensor + // CHECK: "tfl.fake_quant"(%arg0) <{max = 1.400000e+00 : f32, min = 3.000000e-01 : f32, narrow_range = false, num_bits = 6 : i32}> : (tensor) -> tensor %1 = "tfl.fake_quant"(%arg0) {num_bits = 6 : i32, 
narrow_range = false, min = 0.3:f32, max = 1.4:f32} : (tensor) -> tensor func.return %1 : tensor } @@ -622,7 +622,7 @@ func.func @testFakeQuant(tensor, f32, f32) -> tensor { // CHECK-LABEL: testQuantize func.func @testQuantize(tensor) -> tensor> { ^bb0(%arg0: tensor): - // CHECK: %0 = "tfl.quantize"(%arg0) {qtype = tensor>} + // CHECK: %0 = "tfl.quantize"(%arg0) <{qtype = tensor>}> %0 = "tfl.quantize"(%arg0) {qtype = tensor>} : (tensor) -> tensor> func.return %0 : tensor> } @@ -738,7 +738,7 @@ func.func @testPadding(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf3 // CHECK-LABEL: testMaxPool2D func.func @testMaxPool2D(tensor<256x32x32x3xf32>) -> tensor { ^bb0(%arg0: tensor<256x32x32x3xf32>): - // CHECK: "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>) -> tensor + // CHECK: "tfl.max_pool_2d"(%arg0) <{filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<256x32x32x3xf32>) -> tensor %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>) -> tensor func.return %0 : tensor } @@ -748,7 +748,7 @@ func.func @testMaxPool2D(tensor<256x32x32x3xf32>) -> tensor { // CHECK-LABEL: testMaxPool2DQuantized func.func @testMaxPool2DQuantized(tensor<256x32x32x3x!quant.uniform>) -> tensor> { ^bb0(%arg0: tensor<256x32x32x3x!quant.uniform>): - // CHECK: "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} + // CHECK: "tfl.max_pool_2d"(%arg0) <{filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3x!quant.uniform>) -> tensor> func.return %0 : tensor> } @@ -824,7 +824,7 @@ func.func @testLogisticWithWrongInputType(tensor) -> tensor { // CHECK-LABEL: testUnidirectionalSequenceRnn func.func @testUnidirectionalSequenceRnn(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) <{fused_activation_function = "NONE", time_major = false}> : (tensor, tensor, tensor, tensor, tensor) -> tensor %0 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -833,7 +833,7 @@ func.func @testUnidirectionalSequenceRnn(%arg0: tensor, %arg1: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: none, %arg17: none, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: 
tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) <{fused_activation_function = "NONE", time_major = false}> : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -842,7 +842,7 @@ func.func @testUnidirectionalSequenceLstmWithoutProjection(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) <{fused_activation_function = "NONE", time_major = false}> : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -860,7 +860,7 @@ func.func 
@testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr(%arg0: tens %arg20: none, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, // CHECK-SAME: %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, - // CHECK-SAME: %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : + // CHECK-SAME: %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) <{cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false}> : // CHECK-SAME: (tensor, // CHECK-SAME: none, tensor, tensor, tensor, none, tensor, tensor, tensor, // CHECK-SAME: none, tensor, tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, @@ -879,7 +879,7 @@ func.func @testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr(%arg0: tens // CHECK-LABEL: testUnidirectionalSequenceLstmWithIntermediates func.func @testUnidirectionalSequenceLstmWithIntermediates(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0077881771139800549>>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0xf32>, input_to_forget_intermediate = tensor<0xf32>, input_to_input_intermediate = tensor<0xf32>, input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) <{cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0077881771139800549>>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0xf32>, input_to_forget_intermediate = tensor<0xf32>, input_to_input_intermediate = tensor<0xf32>, input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false}> : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0077881771139800549>>, fused_activation_function = "TANH", input_to_cell_intermediate = 
tensor<0xf32>, input_to_forget_intermediate = tensor<0xf32>, input_to_input_intermediate = tensor<0xf32>, input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -900,16 +900,17 @@ func.func @testLstmIntermediates(%arg0: tensor<1x528x!quant.uniform none %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %cst, %cst, %cst, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) ({}) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, kernel_type = #tfl, proj_clip = 0.01 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> func.return %0 : tensor<1x640x!quant.uniform> -// CHECK: %[[RES0:.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: %[[RES1:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %[[RES0]], %[[RES0]], %[[RES0]], %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) ({ -// CHECK: }) {cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, kernel_type = #tfl, proj_clip = 0.00999999977 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, 
tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> +// CHECK: %[[RES0:.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: %[[RES1:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %[[RES0]], %[[RES0]], %[[RES0]], %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) +// CHECK-SAME: <{cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, kernel_type = #tfl, proj_clip = 0.00999999977 : f32}> ({ +// CHECK: }) : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> } // ----- // CHECK-LABEL: testBidirectionalSequenceLstm func.func @testBidirectionalSequenceLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor, %arg24: tensor, %arg25: tensor, %arg26: tensor, %arg27: tensor, %arg28: tensor, %arg29: tensor, %arg30: tensor, %arg31: tensor, %arg32: tensor, %arg33: tensor, %arg34: tensor, %arg35: tensor, %arg36: tensor, %arg37: tensor, %arg38: tensor, %arg39: tensor, %arg40: tensor, %arg41: tensor, %arg42: tensor, %arg43: tensor, %arg44: tensor, %arg45: tensor, %arg46: tensor, %arg47: tensor) -> tensor { - // CHECK: "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, 
tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) + // CHECK: "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) <{cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false}> : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) %0:2 = "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) func.return %0#0 : tensor } @@ -922,9 +923,10 @@ func.func @testLstmQuantizedType(%arg0: tensor<1x528x!quant.uniform, proj_clip = 0.01 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> func.return %0 : tensor<1x640x!quant.uniform> - // CHECK: %[[RES0:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[RES1:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %[[RES0]], %[[RES0]], %[[RES0]], %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, 
%arg17, %arg18) ({ - // CHECK-NEXT: }) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = #tfl, proj_clip = 0.00999999977 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> + // CHECK: %[[RES0:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[RES1:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %[[RES0]], %[[RES0]], %[[RES0]], %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) + // CHECK-SAME: <{cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = #tfl, proj_clip = 0.00999999977 : f32}> ({ + // CHECK-NEXT: }) : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x640x!quant.uniform:f32, 0.059801999479532242>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> // CHECK: return %[[RES1]] } @@ -933,7 +935,8 @@ func.func @testLstmQuantizedType(%arg0: tensor<1x528x!quant.uniform, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) - // CHECK-NEXT: {fused_activation_function = "NONE", kernel_type = #tfl} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK-SAME: <{fused_activation_function = "NONE", kernel_type = 
#tfl}> ({ + // CHECK-NEXT: }) : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = "NONE", kernel_type = #tfl} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -966,8 +969,8 @@ func.func @testLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg18: tensor, %arg19: tensor, %arg20: none, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) - // CHECK-NEXT: {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl} : - // CHECK-SAME: (tensor, + // CHECK-SAME: <{cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = #tfl}> ({ + // CHECK-NEXT: }) : (tensor, // CHECK-SAME: none, tensor, tensor, tensor, none, tensor, tensor, tensor, // CHECK-SAME: none, tensor, tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, // CHECK-SAME: none, tensor, tensor, tensor) -> tensor @@ -1362,13 +1365,13 @@ func.func @testPadV2UnsupportedPaddings(tensor<*xf32>, tensor<6x3xi32>) -> tenso // ----- func.func @packQuantizedU8(%arg0: tensor<2x!quant.uniform>, %arg1: tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = 0 : i32, values_count = 2 : i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> func.return %0 : tensor<2x2x!quant.uniform> } func.func @packQuantizedI8(%arg0: tensor<2x!quant.uniform>, %arg1: tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = 0 : i32, values_count = 2 : i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<2x2x!quant.uniform> func.return %0 : tensor<2x2x!quant.uniform> } @@ -1376,7 +1379,7 @@ func.func @packQuantizedI8(%arg0: tensor<2x!quant.uniform>, %arg1: // ----- func.func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = 0 : i32, values_count = 2 : i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2x2xi32> func.return %0 : tensor<2x2xi32> } @@ -1384,7 +1387,7 @@ func.func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // ----- func.func @packUnranked(%arg0: tensor<2xi32>, %arg1: tensor<*xi32>) -> tensor<2x2xi32> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = 0 : i32, values_count = 2 : 
i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = 0 : i32, values_count = 2 : i32} : (tensor<2xi32>, tensor<*xi32>) -> tensor<2x2xi32> func.return %0 : tensor<2x2xi32> } @@ -1392,7 +1395,7 @@ func.func @packUnranked(%arg0: tensor<2xi32>, %arg1: tensor<*xi32>) -> tensor<2x // ----- func.func @packInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x4x2xi32> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = 2 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = 2 : i32, values_count = 2 : i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = 2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<1x4x2xi32> func.return %0 : tensor<1x4x2xi32> } @@ -1400,13 +1403,13 @@ func.func @packInputRank(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tens // ----- func.func @packNegInputAxis2(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<1x2x4xi32> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = -2 : i32, values_count = 2 : i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<1x2x4xi32> func.return %0 : tensor<1x2x4xi32> } func.func @packNegInputAxis3(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> tensor<2x1x4xi32> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -3 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = -3 : i32, values_count = 2 : i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = -3 : i32, values_count = 2 : i32} : (tensor<1x4xi32>, tensor<1x4xi32>) -> tensor<2x1x4xi32> func.return %0 : tensor<2x1x4xi32> } @@ -1414,7 +1417,7 @@ func.func @packNegInputAxis3(%arg0: tensor<1x4xi32>, %arg1: tensor<1x4xi32>) -> // ----- func.func @packInputUnranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> { - // CHECK: "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} + // CHECK: "tfl.pack"(%arg0, %arg1) <{axis = -2 : i32, values_count = 2 : i32}> %0 = "tfl.pack"(%arg0, %arg1) {axis = -2 : i32, values_count = 2 : i32} : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> func.return %0 : tensor<*xi32> } @@ -1446,7 +1449,7 @@ func.func @pack(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2x2xi32> { // ----- func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { - // CHECK: "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} + // CHECK: "tfl.unpack"(%arg0) <{axis = 1 : i32, num = 3 : i32}> %0:3 = "tfl.unpack"(%arg0) {axis = 1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) func.return %0#0 : tensor<2xi32> } @@ -1454,7 +1457,7 @@ func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { - // CHECK: "tfl.unpack"(%arg0) {axis = -1 : i32, num = 3 : i32} + // CHECK: "tfl.unpack"(%arg0) <{axis = -1 : i32, num = 3 : i32}> %0:3 = "tfl.unpack"(%arg0) {axis = -1 : i32, num = 3 : i32} : (tensor<2x3xi32>) -> (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) func.return %0#0 : tensor<2xi32> } @@ -1462,7 +1465,7 @@ func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<2xi32> { // ----- func.func @unpack(%arg0: tensor<2x3xi32>) -> tensor<3xi32> { - // CHECK: "tfl.unpack"(%arg0) {axis = -2 : i32, num = 2 : i32} + // CHECK: "tfl.unpack"(%arg0) <{axis = -2 : i32, num = 2 : i32}> %0:2 = "tfl.unpack"(%arg0) {axis = -2 : i32, num = 2 : i32} : (tensor<2x3xi32>) -> (tensor<3xi32>, tensor<3xi32>) func.return %0#0 : tensor<3xi32> } @@ -1538,7 +1541,7 @@ 
func.func @unpack(%arg0: tensor) -> () { // CHECK-LABEL: testMean func.func @testMean(%arg0: tensor<2x2xf32>, %arg1 : tensor<1xi32>) -> tensor<1x2xf32> { - // CHECK: "tfl.mean"(%arg0, %arg1) {keep_dims = false} + // CHECK: "tfl.mean"(%arg0, %arg1) <{keep_dims = false}> %0 = "tfl.mean"(%arg0, %arg1) {keep_dims = false}: (tensor<2x2xf32>, tensor<1xi32>) -> tensor<1x2xf32> func.return %0 : tensor<1x2xf32> } @@ -1547,7 +1550,7 @@ func.func @testMean(%arg0: tensor<2x2xf32>, %arg1 : tensor<1xi32>) -> tensor<1x2 // CHECK-LABEL: testMean_true func.func @testMean_true(%arg0: tensor<2x2xf32>, %arg1 : tensor<1xi32>) -> tensor<1x2xf32> { - // CHECK: "tfl.mean"(%arg0, %arg1) {keep_dims = true} + // CHECK: "tfl.mean"(%arg0, %arg1) <{keep_dims = true}> %0 = "tfl.mean"(%arg0, %arg1) {keep_dims = true}: (tensor<2x2xf32>, tensor<1xi32>) -> tensor<1x2xf32> func.return %0 : tensor<1x2xf32> } @@ -1597,7 +1600,7 @@ func.func @testBatchMatmulHybridQuant(%arg0 : tensor<1x4x384x32xf32>, %arg1 : te // ----- func.func @testConcat(%arg0: tensor<1x2xi32>, %arg1: tensor<1x2xi32>) -> tensor<2x2xi32> { - // CHECK: "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} + // CHECK: "tfl.concatenation"(%arg0, %arg1) <{axis = 0 : i32, fused_activation_function = "NONE"}> %0 = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xi32>, tensor<1x2xi32>) -> tensor<2x2xi32> func.return %0 : tensor<2x2xi32> } @@ -1605,7 +1608,7 @@ func.func @testConcat(%arg0: tensor<1x2xi32>, %arg1: tensor<1x2xi32>) -> tensor< // ----- func.func @testConcatQuantized(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<1x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> { - // CHECK: "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} + // CHECK: "tfl.concatenation"(%arg0, %arg1) <{axis = 0 : i32, fused_activation_function = "NONE"}> %0 = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> func.return %0 : tensor<2x2x!quant.uniform> } @@ -1692,7 +1695,7 @@ func.func @testConcatBenignDynamicDimSizeOperand(%arg0: tensor<1x?xi32>, %arg1: // CHECK-LABEL: testResizeBilinear func.func @testResizeBilinear(%arg0 : tensor<1x100x100x3xf32>, %arg1 : tensor<4xi32>) -> tensor { - // CHECK: "tfl.resize_bilinear"(%arg0, %arg1) {align_corners = false, half_pixel_centers = false} + // CHECK: "tfl.resize_bilinear"(%arg0, %arg1) <{align_corners = false, half_pixel_centers = false}> %0 = "tfl.resize_bilinear"(%arg0, %arg1) {align_corners = false, half_pixel_centers = false} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor func.return %0 : tensor } @@ -1709,7 +1712,7 @@ func.func @testResizeBilinearInvalidOutputType(%arg0 : tensor<1x100x100x3xf32>, // CHECK-LABEL: testStridedSlice func.func @testStridedSlice(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> { - // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<12x2x2x5xf32>, 
tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> func.return %0 : tensor<1x2x2x5xf32> } @@ -1750,7 +1753,7 @@ func.func @testStridedSliceWithInvalidOutputType(%arg0: tensor<12x2x2x5xf32>, %a // CHECK-LABEL: testOneHot func.func @testOneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xf32> { - // CHECK: "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> + // CHECK: "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) <{axis = -1 : i32}> : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1784,7 +1787,7 @@ func.func @testArgMin(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { // CHECK-LABEL: testSpaceToDepth func.func @testSpaceToDepthF32(%arg0: tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xf32> { // CHECK: %[[ARG:.*]]: tensor<1x2x2x1xf32> - // CHECK: "tfl.space_to_depth"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xf32> + // CHECK: "tfl.space_to_depth"(%[[ARG]]) <{block_size = 2 : i32}> : (tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xf32> %0 = "tfl.space_to_depth"(%arg0) {block_size = 2: i32} : (tensor<1x2x2x1xf32>) -> tensor<1x1x1x4xf32> func.return %0 : tensor<1x1x1x4xf32> } @@ -2046,7 +2049,7 @@ func.func @testWrongQuantizedLocalResponseNormalization(%arg0 : tensor<1x56x56x1 // CHECK-LABEL: testSvdf func.func @testSvdf(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { - // CHECK: "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "RELU", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK: "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) <{fused_activation_function = "RELU", rank = 2 : i32}> : (tensor, tensor, tensor, tensor, tensor) -> tensor %0 = "tfl.svdf"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "RELU", rank = 2 : i32} : (tensor, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -2056,7 +2059,7 @@ func.func @testSvdf(%arg0: tensor, %arg1: tensor, %arg2: tenso // CHECK-LABEL: testDepthToSpace func.func @testDepthToSpaceF32(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> { // CHECK: %[[ARG:.*]]: tensor<1x1x1x4xf32> - // CHECK: "tfl.depth_to_space"(%[[ARG]]) {block_size = 2 : i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> + // CHECK: "tfl.depth_to_space"(%[[ARG]]) <{block_size = 2 : i32}> : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> %0 = "tfl.depth_to_space"(%arg0) {block_size = 2: i32} : (tensor<1x1x1x4xf32>) -> tensor<1x2x2x1xf32> func.return %0 : tensor<1x2x2x1xf32> } @@ -2619,7 +2622,7 @@ func.func @testTransposeConv(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x128xf32> // CHECK-LABEL: testTransposeConvWithOutputThatHasDynamicSizes func.func @testTransposeConvWithOutputThatHasDynamicSizes(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x128xf32>, %arg2: tensor<1x32x42x128xf32>) -> tensor { - // CHECK: %[[NONE:.*]] = "tfl.no_value"() {value} : () -> none + // CHECK: %[[NONE:.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK: "tfl.transpose_conv"(%arg0, %arg1, %arg2, %[[NONE]]) %cst 
= "tfl.no_value"() {value = unit} : () -> none %0 = "tfl.transpose_conv"(%arg0, %arg1, %arg2, %cst) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32, fused_activation_function = "NONE"} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 75c1a791eeca73..3c2c24baba8972 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -14,7 +14,7 @@ func.func @fusedConv2dRelu(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x %1 = "tfl.relu"(%0) : (tensor<256x32x32x16xf32>) -> tensor<256x32x32x16xf32> func.return %1 : tensor<256x32x32x16xf32> - // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> + // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> // CHECK: return %0 } @@ -24,7 +24,7 @@ func.func @fusedDepthwiseConv2dRelu6(%arg0: tensor<256x32x32x3xf32>, %arg1: tens %1 = "tfl.relu6"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> func.return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "tfl.depthwise_conv_2d"(%arg0, %arg1, %arg2) {depth_multiplier = 4 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> + // CHECK: %0 = "tfl.depthwise_conv_2d"(%arg0, %arg1, %arg2) <{depth_multiplier = 4 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> // CHECK: return %0 } @@ -34,7 +34,7 @@ func.func @fusedMaxPool2dRelu(%arg0: tensor<1x147x147x16xf32>) -> tensor<1x73x73 %1 = "tfl.relu"(%0) : (tensor<1x73x73x16xf32>) -> tensor<1x73x73x16xf32> func.return %1 : tensor<1x73x73x16xf32> - // CHECK: %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + // CHECK: %0 = "tfl.max_pool_2d"(%arg0) <{filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> // CHECK: return %0 } @@ -44,7 +44,7 @@ func.func @fusedAvgPool2dRelu1(%arg0: tensor<1x147x147x16xf32>) -> tensor<1x73x7 %1 = "tfl.relu_n1_to_1"(%0) : (tensor<1x73x73x16xf32>) -> tensor<1x73x73x16xf32> func.return %1 : tensor<1x73x73x16xf32> - // CHECK: %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU_N1_TO_1", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + // CHECK: %0 = "tfl.average_pool_2d"(%arg0) 
<{filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU_N1_TO_1", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> // CHECK: return %0 } @@ -188,7 +188,7 @@ func.func @fuseMulIntoTransposeConvNoBias(%arg0: tensor<1x32x42x128xf32>) -> ten // CHECK-DAG: %[[SHAPE:.*]] = arith.constant dense<[1, 64, 84, 32]> : tensor<4xi32> // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<1.500000e+00> : tensor<32x4x4x128xf32> - // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() {value} : () -> none + // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) // CHECK: return %[[RESULT]] } @@ -204,7 +204,7 @@ func.func @fuseAddIntoFollowingConv2d(%arg0: tensor<256x32x32x3xf32>) -> tensor< // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<16x3x3x3xf32> // CHECK-DAG: %[[b:.*]] = "tfl.pseudo_const"(){{.*}}dense<[4.150000e+01, 4.250000e+01, 4.350000e+01, 4.450000e+01, 4.550000e+01, 4.650000e+01, 4.750000e+01, 4.850000e+01, 4.950000e+01, 5.050000e+01, 5.150000e+01, 5.250000e+01, 5.350000e+01, 5.450000e+01, 5.550000e+01, 5.650000e+01]> : tensor<16xf32> -// CHECK-NEXT: %[[c:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> +// CHECK-NEXT: %[[c:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> // CHECK-NEXT: return %[[c]] : tensor<256x30x30x16xf32> } @@ -219,7 +219,7 @@ func.func @fuseSubIntoFollowingConv2d(%arg0: tensor<256x32x32x3xf32>) -> tensor< // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<16x3x3x3xf32> // CHECK-DAG: %[[b:.*]] = "tfl.pseudo_const"(){{.*}}dense<[-3.950000e+01, -3.850000e+01, -3.750000e+01, -3.650000e+01, -3.550000e+01, -3.450000e+01, -3.350000e+01, -3.250000e+01, -3.150000e+01, -3.050000e+01, -2.950000e+01, -2.850000e+01, -2.750000e+01, -2.650000e+01, -2.550000e+01, -2.450000e+01]> : tensor<16xf32> -// CHECK-NEXT: %[[c:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> +// CHECK-NEXT: %[[c:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> // CHECK-NEXT: return %[[c]] : tensor<256x30x30x16xf32> } @@ -279,7 +279,7 @@ func.func @fuseAddIntoFollowingDepthwiseConv2d(%arg0: tensor<256x32x32x3xf32>) - // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<3x3x3x16xf32> // CHECK-DAG: %[[b:.*]] = "tfl.pseudo_const"(){{.*}}dense<[4.150000e+01, 4.250000e+01, 4.350000e+01, 4.450000e+01, 4.550000e+01, 4.650000e+01, 4.750000e+01, 4.850000e+01, 4.950000e+01, 5.050000e+01, 5.150000e+01, 5.250000e+01, 
5.350000e+01, 5.450000e+01, 5.550000e+01, 5.650000e+01]> : tensor<16xf32> -// CHECK-NEXT: %[[dc:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) {depth_multiplier = 4 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> +// CHECK-NEXT: %[[dc:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) <{depth_multiplier = 4 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> // CHECK-NEXT: return %[[dc]] : tensor<256x30x30x16xf32> } @@ -322,7 +322,7 @@ func.func @fuseMulIntoConv2dWithQDQs(%arg0: tensor<256x32x32x3xf32>) -> tensor<2 // CHECK-DAG: %[[w:.*]] = "tfl.pseudo_const"(){{.*}}dense<3.000000e+00> : tensor<3x3x3x3xf32> // CHECK-DAG: %[[cst:.*]] = "tfl.pseudo_const"(){{.*}}dense<[1.500000e+00, 3.000000e+00, 4.500000e+00]> : tensor<3xf32> - // CHECK: %[[q:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<3x3x3x3x!quant.uniform:f32:0, {1.500000e+00,3.000000e+00,4.500000e+00}>>} + // CHECK: %[[q:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<3x3x3x3x!quant.uniform:f32:0, {1.500000e+00,3.000000e+00,4.500000e+00}>>}> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]], %[[cst]]) // CHECK: return %[[conv]] : tensor<256x8x7x3xf32> @@ -341,7 +341,7 @@ func.func @fuseMulIntoFullyConnectedWithOptionalAttribute(%arg0: tensor<4x2xf32> // CHECK-DAG: %[[CONSTANT:.*]] = "tfl.pseudo_const"(){{.*}}dense<{{\[\[}}1.000000e+00, 2.000000e+00], [6.000000e+00, 8.000000e+00]]> : tensor<2x2xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = "tfl.pseudo_const"(){{.*}}dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> -// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %[[CONSTANT0]]) {asymmetric_quantize_inputs = true, +// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %[[CONSTANT0]]) <{asymmetric_quantize_inputs = true, } // CHECK-LABEL: @fuseMulIntoFullyConnected @@ -357,7 +357,7 @@ func.func @fuseMulIntoFullyConnected(%arg0: tensor<4x2xf32>) -> tensor<4x2xf32> // CHECK-DAG: %[[CONSTANT:.*]] = "tfl.pseudo_const"(){{.*}}dense<{{\[\[}}1.000000e+00, 2.000000e+00], [6.000000e+00, 8.000000e+00]]> : tensor<2x2xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = "tfl.pseudo_const"(){{.*}}dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> -// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %[[CONSTANT0]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %[[CONSTANT0]]) <{fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: return %[[RES]] : tensor<4x2xf32> } @@ -372,8 +372,8 @@ func.func @DontFuseMulIntoFullyConnectedForLargeFilter(%arg0: tensor<128x256000x func.return %1 : tensor<128x1024xf32> -// CHECK: %[[a:.*]] = "tfl.fully_connected"(%arg0, %cst_0, %cst_1) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} -// CHECK: %[[b:.*]] = tfl.mul(%[[a]], %cst) {fused_activation_function = "RELU6"} +// CHECK: %[[a:.*]] = "tfl.fully_connected"(%arg0, %cst_0, %cst_1) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> +// CHECK: 
%[[b:.*]] = tfl.mul(%[[a]], %cst) <{fused_activation_function = "RELU6"}>
}
@@ -393,9 +393,9 @@ func.func @skipFuseMulIntoFullyConnected(%arg0: tensor<4x2xf32>) -> (tensor<1x8x
// CHECK: %cst_0 = arith.constant dense<2.000000e+00> : tensor<2xf32>
// CHECK: %cst_1 = arith.constant dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf32>
// CHECK: %cst_2 = arith.constant dense<[1, 8]> : tensor<2xi32>
- // CHECK: %0 = "tfl.fully_connected"(%arg0, %cst, %cst_0) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
+ // CHECK: %0 = "tfl.fully_connected"(%arg0, %cst, %cst_0) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
// CHECK: %1 = "tfl.reshape"(%0, %cst_2) : (tensor<4x2xf32>, tensor<2xi32>) -> tensor<1x8xf32>
- // CHECK: %2 = tfl.mul(%0, %cst_1) {fused_activation_function = "RELU6"} : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
+ // CHECK: %2 = tfl.mul(%0, %cst_1) <{fused_activation_function = "RELU6"}> : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
// CHECK: return %1, %2 : tensor<1x8xf32>, tensor<4x2xf32>
}
@@ -414,7 +414,7 @@ func.func @fuseAddIntoFollowingFullyConnectedWithQDQs(%arg0: tensor<4x2xf32>) ->
// CHECK-DAG: %[[b:.*]] = "tfl.pseudo_const"(){{.*}}dense<[6.500000e+00, 1.250000e+01]> : tensor<2xf32>
// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%[[w]])
// CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
-// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq]], %[[b]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
+// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq]], %[[b]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
// CHECK-NEXT: return %[[fc]] : tensor<4x2xf32>
}
@@ -429,7 +429,7 @@ func.func @fuseAddIntoFollowingFullyConnected(%arg0: tensor<4x2xf32>) -> tensor<
// CHECK-DAG: %[[w:.*]] = arith.constant dense<{{\[}}[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]]> : tensor<2x2xf32>
// CHECK-DAG: %[[b:.*]] = "tfl.pseudo_const"(){{.*}}dense<[6.500000e+00, 1.250000e+01]> : tensor<2xf32>
-// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
+// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32>
// CHECK-NEXT: return %[[fc]] : tensor<4x2xf32>
}
@@ -456,7 +456,7 @@ func.func @fuseMulIntoFollowingFullyConnected(%arg0: tensor<4x2xf32>) -> tensor<
// CHECK-DAG: %[[b:.*]] = arith.constant dense<2.000000e+00> : tensor<2xf32>
// CHECK-DAG: %[[w:.*]] = "tfl.pseudo_const"(){{.*}}dense<{{\[}}[1.500000e+00, 3.000000e+00], [4.500000e+00, 6.000000e+00]]> : tensor<2x2xf32>
-// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0,
%[[w]], %[[b]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> // CHECK-NEXT: return %[[fc]] : tensor<4x2xf32> } @@ -472,7 +472,7 @@ func.func @fuseMulIntoFullyConnectedBroadcast(%arg0: tensor<1x3xf32>) -> tensor< // CHECK-DAG: %[[CONSTANT:.*]] = "tfl.pseudo_const"(){{.*}}dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [2.000000e+00, 4.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = "tfl.pseudo_const"(){{.*}}dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> -// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %[[CONSTANT0]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %[[CONSTANT0]]) <{fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: return %[[RES]] : tensor<1x2xf32> } @@ -487,7 +487,7 @@ func.func @fuseMulIntoFullyConnectedNoBias(%arg0: tensor<4x2xf32>, %arg1: none) func.return %1 : tensor<4x2xf32> // CHECK-DAG: %[[CONSTANT:.*]] = "tfl.pseudo_const"(){{.*}}dense<{{\[\[}}1.000000e+00, 2.000000e+00], [6.000000e+00, 8.000000e+00]]> : tensor<2x2xf32> -// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %arg1) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, none) -> tensor<4x2xf32> +// CHECK: %[[RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONSTANT]], %arg1) <{fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, none) -> tensor<4x2xf32> // CHECK: return %[[RES]] : tensor<4x2xf32> } @@ -504,7 +504,7 @@ func.func @fuseMulIntoDepthwiseConv2d(%arg0: tensor<1x112x112x2xf32>) -> tensor< // CHECK-DAG: %cst = arith.constant dense<{{\[\[\[\[}}1.000000e+00, 4.000000e+00], [3.000000e+00, 8.000000e+00], [5.000000e+00, 1.200000e+01]], {{\[\[}}7.000000e+00, 1.600000e+01], [9.000000e+00, 2.000000e+01], [1.100000e+01, 2.400000e+01]], {{\[\[}}1.300000e+01, 2.800000e+01], [1.500000e+01, 3.200000e+01], [1.700000e+01, 3.600000e+01]]]]> : tensor<1x3x3x2xf32> // CHECK-DAG: %cst_0 = arith.constant dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> -// CHECK: %0 = "tfl.depthwise_conv_2d"(%arg0, %cst, %cst_0) {depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> +// CHECK: %0 = "tfl.depthwise_conv_2d"(%arg0, %cst, %cst_0) <{depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> // CHECK: return %0 } @@ -521,7 +521,7 @@ func.func @fuse4DMulIntoDepthwiseConv2d(%arg0: tensor<1x112x112x2xf32>) -> tenso // CHECK-DAG: %cst = arith.constant dense<{{\[\[\[\[}}1.000000e+00, 4.000000e+00], [3.000000e+00, 8.000000e+00], [5.000000e+00, 1.200000e+01]], {{\[\[}}7.000000e+00, 1.600000e+01], [9.000000e+00, 2.000000e+01], [1.100000e+01, 2.400000e+01]], {{\[\[}}1.300000e+01, 2.800000e+01], [1.500000e+01, 3.200000e+01], [1.700000e+01, 3.600000e+01]]]]> : tensor<1x3x3x2xf32> // CHECK-DAG: %cst_0 = arith.constant 
dense<[2.000000e+00, 4.000000e+00]> : tensor<2xf32> -// CHECK: %0 = "tfl.depthwise_conv_2d"(%arg0, %cst, %cst_0) {depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> +// CHECK: %0 = "tfl.depthwise_conv_2d"(%arg0, %cst, %cst_0) <{depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> // CHECK: return %0 } @@ -648,13 +648,13 @@ func.func @FuseFullyConnectedMultiUseAddBroadcastedNagative(%arg0: tensor<1x40x3 %4 = "tfl.mul"(%2, %cst1) {fused_activation_function = "NONE"} : (tensor<1x40x4xf32>, tensor<1x1x4xf32>) -> tensor<1x40x4xf32> func.return %1, %3, %4 : tensor<1x40x4xf32>, tensor<1x40x4xf32>, tensor<1x40x4xf32> - // CHECK: %0 = "tfl.no_value"() {value} : () -> none + // CHECK: %0 = "tfl.no_value"() <{value}> : () -> none // CHECK: %cst = arith.constant dense<{{\[\[\[}}2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00]]]> : tensor<1x1x4xf32> - // CHECK: %1 = "tfl.fully_connected"(%arg0, %arg1, %0) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x40x37xf32>, tensor<4x37xf32>, none) -> tensor<1x40x4xf32> - // CHECK: %2 = tfl.add(%1, %cst) {fused_activation_function = "NONE"} : (tensor<1x40x4xf32>, tensor<1x1x4xf32>) -> tensor<1x40x4xf32> - // CHECK: %3 = "tfl.fully_connected"(%arg0, %arg1, %0) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x40x37xf32>, tensor<4x37xf32>, none) -> tensor<1x40x4xf32> - // CHECK: %4 = tfl.add(%3, %cst) {fused_activation_function = "NONE"} : (tensor<1x40x4xf32>, tensor<1x1x4xf32>) -> tensor<1x40x4xf32> - // CHECK: %5 = tfl.mul(%3, %cst) {fused_activation_function = "NONE"} : (tensor<1x40x4xf32>, tensor<1x1x4xf32>) -> tensor<1x40x4xf32> + // CHECK: %1 = "tfl.fully_connected"(%arg0, %arg1, %0) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x40x37xf32>, tensor<4x37xf32>, none) -> tensor<1x40x4xf32> + // CHECK: %2 = tfl.add(%1, %cst) <{fused_activation_function = "NONE"}> : (tensor<1x40x4xf32>, tensor<1x1x4xf32>) -> tensor<1x40x4xf32> + // CHECK: %3 = "tfl.fully_connected"(%arg0, %arg1, %0) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x40x37xf32>, tensor<4x37xf32>, none) -> tensor<1x40x4xf32> + // CHECK: %4 = tfl.add(%3, %cst) <{fused_activation_function = "NONE"}> : (tensor<1x40x4xf32>, tensor<1x1x4xf32>) -> tensor<1x40x4xf32> + // CHECK: %5 = tfl.mul(%3, %cst) <{fused_activation_function = "NONE"}> : (tensor<1x40x4xf32>, tensor<1x1x4xf32>) -> tensor<1x40x4xf32> // CHECK: return %2, %4, %5 : tensor<1x40x4xf32>, tensor<1x40x4xf32>, tensor<1x40x4xf32> } @@ -670,7 +670,7 @@ func.func @FuseFullyConnectedBroadcastedBiasAddWithQDQs(%arg0: tensor<40x37xf32> // CHECK: %[[cst:.*]] = arith.constant dense<2.000000e+00> : tensor<40xf32> // CHECK: %[[q:.*]] = "tfl.quantize" - // CHECK-SAME: {qtype = tensor<40x!quant.uniform>} : (tensor<40xf32>) -> tensor<40x!quant.uniform> + // CHECK-SAME: <{qtype = tensor<40x!quant.uniform>}> : (tensor<40xf32>) -> tensor<40x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize" // CHECK-SAME: 
(tensor<40x!quant.uniform>) -> tensor<40xf32>
// CHECK: %[[fc:.*]] = "tfl.fully_connected"
@@ -735,7 +735,7 @@ func.func @FuseFullyConnectedAddNoBiasWithUnfusableRhs(%arg0: tensor<4x37xf32>,
func.return %1 : tensor<4x4xf32>
- // CHECK-DAG: %[[unit:.*]] = "tfl.no_value"() {value} : () -> none
+ // CHECK-DAG: %[[unit:.*]] = "tfl.no_value"() <{value}> : () -> none
// CHECK-DAG: %[[filter:.*]] = arith.constant dense<{{.*}}> : tensor<4x4xf32>
// CHECK: %[[fc_result:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[unit]])
// CHECK: %[[add_result:.*]] = tfl.add %[[fc_result]], %[[filter]]
@@ -769,7 +769,7 @@ func.func @FuseReshapeAroundBMMLHS(%arg0: tensor<6x5x1024xf32>) -> tensor<6x5x81
%2 = "tfl.reshape"(%1, %cst_0) : (tensor<30x8192xf32>, tensor<3xi32>) -> tensor<6x5x8192xf32>
return %2 : tensor<6x5x8192xf32>
// CHECK: %cst = arith.constant dense_resource<__elided__> : tensor<1024x8192xf32>
- // CHECK: %0 = "tfl.batch_matmul"(%arg0, %cst) {adj_x = false, adj_y = false} : (tensor<6x5x1024xf32>, tensor<1024x8192xf32>) -> tensor<6x5x8192xf32>
+ // CHECK: %0 = "tfl.batch_matmul"(%arg0, %cst) <{adj_x = false, adj_y = false}> : (tensor<6x5x1024xf32>, tensor<1024x8192xf32>) -> tensor<6x5x8192xf32>
// CHECK: return %0 : tensor<6x5x8192xf32>
}
@@ -784,7 +784,7 @@ func.func @FuseReshapeAroundBMMLHSNegative(%arg0: tensor<1x64xf32>, %arg1: tenso
// CHECK: %cst = arith.constant dense<[1, 1024]> : tensor<2xi32>
// CHECK: %cst_0 = arith.constant dense<[1, 1, 64]> : tensor<3xi32>
// CHECK: %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<1x64xf32>, tensor<3xi32>) -> tensor<1x1x64xf32>
- // CHECK: %1 = "tfl.batch_matmul"(%0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x1x64xf32>, tensor<1x64x1024xf32>) -> tensor<1x1x1024xf32>
+ // CHECK: %1 = "tfl.batch_matmul"(%0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<1x1x64xf32>, tensor<1x64x1024xf32>) -> tensor<1x1x1024xf32>
// CHECK: %2 = "tfl.reshape"(%1, %cst) : (tensor<1x1x1024xf32>, tensor<2xi32>) -> tensor<1x1024xf32>
// CHECK: return %2 : tensor<1x1024xf32>
}
@@ -800,7 +800,7 @@ func.func @FuseReshapeAroundBMMNagativeTest(%arg0: tensor<5x4x1x1024xf32>, %arg1
// CHECK: %cst = arith.constant dense_resource<__elided__> : tensor<3xi32>
// CHECK: %cst_0 = arith.constant dense_resource<__elided__> : tensor<4xi32>
// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<5x4x1x1024xf32>, tensor<3xi32>) -> tensor<5x4x1024xf32>
- // CHECK: %1 = "tfl.batch_matmul"(%0, %arg1) {adj_x = false, adj_y = false} : (tensor<5x4x1024xf32>, tensor<5x1024x8192xf32>) -> tensor<5x4x8192xf32>
+ // CHECK: %1 = "tfl.batch_matmul"(%0, %arg1) <{adj_x = false, adj_y = false}> : (tensor<5x4x1024xf32>, tensor<5x1024x8192xf32>) -> tensor<5x4x8192xf32>
// CHECK: %2 = "tfl.reshape"(%1, %cst_0) : (tensor<5x4x8192xf32>, tensor<4xi32>) -> tensor<5x4x1x8192xf32>
// CHECK: return %2 : tensor<5x4x1x8192xf32>
}
@@ -819,8 +819,8 @@ func.func @FuseReshapeAroundBMMNagativeTest2(%arg0: tensor<2x1536xf32>) -> tenso
// CHECK: %cst = arith.constant dense_resource<__elided__> : tensor<3xi32>
// CHECK: %cst_0 = arith.constant dense_resource<__elided__> : tensor<2xi32>
// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<2x1536xf32>, tensor<3xi32>) -> tensor<2x12x128xf32>
- // CHECK: %1 = "tfl.pseudo_qconst"() {qtype = tensor<128x64x!quant.uniform>, value = dense<9> : tensor<128x64xi8>} : () -> tensor<128x64x!quant.uniform>
- // CHECK: %2 = "tfl.batch_matmul"(%0, %1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = true} : (tensor<2x12x128xf32>, tensor<128x64x!quant.uniform>) -> tensor<2x12x64xf32>
+ // CHECK: %1 = "tfl.pseudo_qconst"() <{qtype = tensor<128x64x!quant.uniform>, value = dense<9> : tensor<128x64xi8>}> : () -> tensor<128x64x!quant.uniform>
+ // CHECK: %2 = "tfl.batch_matmul"(%0, %1) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = true}> : (tensor<2x12x128xf32>, tensor<128x64x!quant.uniform>) -> tensor<2x12x64xf32>
// CHECK: %3 = "tfl.reshape"(%2, %cst_0) : (tensor<2x12x64xf32>, tensor<2xi32>) -> tensor<2x768xf32>
// CHECK: return %3 : tensor<2x768xf32>
}
@@ -835,7 +835,7 @@ func.func @FuseReshapeAroundBMMRHS(%arg0: tensor<1x3x6x5x1024xf32>) -> tensor<1x
%2 = "tfl.reshape"(%1, %cst_0) : (tensor<1x90x8192xf32>, tensor<5xi32>) -> tensor<1x3x6x5x8192xf32>
return %2 : tensor<1x3x6x5x8192xf32>
// CHECK: %cst = arith.constant dense_resource<__elided__> : tensor<1x1024x8192xf32>
- // CHECK: %0 = "tfl.batch_matmul"(%arg0, %cst) {adj_x = false, adj_y = false} : (tensor<1x3x6x5x1024xf32>, tensor<1x1024x8192xf32>) -> tensor<1x3x6x5x8192xf32>
+ // CHECK: %0 = "tfl.batch_matmul"(%arg0, %cst) <{adj_x = false, adj_y = false}> : (tensor<1x3x6x5x1024xf32>, tensor<1x1024x8192xf32>) -> tensor<1x3x6x5x8192xf32>
// CHECK: return %0 : tensor<1x3x6x5x8192xf32>
}
@@ -845,7 +845,7 @@ func.func @FuseTransposeIntoBMM_LHS(%arg0: tensor<1x4x1440x256xf32>, %arg1: tens
%32 = "tfl.transpose"(%arg1, %cst_1) : (tensor<1x1440x256xf32>, tensor<3xi32>) -> tensor<1x256x1440xf32>
%33 = "tfl.batch_matmul"(%32, %arg0) {adj_x = false, adj_y = false} : (tensor<1x256x1440xf32>, tensor<1x4x1440x256xf32>) -> tensor<1x4x256x256xf32>
return %33 : tensor<1x4x256x256xf32>
- // CHECK: %0 = "tfl.batch_matmul"(%arg1, %arg0) {adj_x = true, adj_y = false} : (tensor<1x1440x256xf32>, tensor<1x4x1440x256xf32>) -> tensor<1x4x256x256xf32>
+ // CHECK: %0 = "tfl.batch_matmul"(%arg1, %arg0) <{adj_x = true, adj_y = false}> : (tensor<1x1440x256xf32>, tensor<1x4x1440x256xf32>) -> tensor<1x4x256x256xf32>
// CHECK: return %0 : tensor<1x4x256x256xf32>
}
@@ -902,8 +902,8 @@ func.func @RetainRedundantReshapeUseInNonBinaryOp(%arg0: tensor<128xf32>, %arg1:
// CHECK-DAG: %cst = arith.constant dense<0> : tensor<1xi32>
// CHECK-DAG: %cst_0 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32>
// CHECK: %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<128xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32>
- // CHECK: %1 = tfl.mul(%0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x1x1x128xf32>, tensor<1x512x512x128xf32>) -> tensor<1x512x512x128xf32>
- // CHECK: %2 = "tfl.reduce_max"(%0, %cst) {keep_dims = false} : (tensor<1x1x1x128xf32>, tensor<1xi32>) -> tensor<128xf32>
+ // CHECK: %1 = tfl.mul(%0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x1x1x128xf32>, tensor<1x512x512x128xf32>) -> tensor<1x512x512x128xf32>
+ // CHECK: %2 = "tfl.reduce_max"(%0, %cst) <{keep_dims = false}> : (tensor<1x1x1x128xf32>, tensor<1xi32>) -> tensor<128xf32>
// CHECK: return %1, %2
}
@@ -963,10 +963,10 @@ func.func @FuseFullyConnectedReshapeAddConstWithOptionalAttribute(%arg0: tensor<
func.return %3 : tensor<40x40xf32>
// CHECK-DAG: %[[cst:.*]] = arith.constant dense<5.000000e+00> : tensor<40x40xf32>
- // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {asymmetric_quantize_inputs = true,
+ // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) <{asymmetric_quantize_inputs = true,
// FOLD: %[[cst:.*]] = arith.constant dense<5.000000e+00> : tensor<40x40xf32>
- // FOLD: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1,
%[[cst]]) <{asymmetric_quantize_inputs = true, } // CHECK-LABEL: @FuseFullyConnectedReshapeAddConstWithActivation @@ -985,13 +985,13 @@ func.func @FuseFullyConnectedReshapeAddConstWithActivation(%arg0: tensor<40x37xf func.return %3 : tensor<40x40xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<5.000000e+00> : tensor<40x40xf32> - // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) <{fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: %[[rs1:.*]] = "tfl.reshape"(%[[fc]] // CHECK: %[[rs2:.*]] = "tfl.reshape"(%[[rs1]] // CHECK: return %[[rs2]] // FOLD: %[[cst:.*]] = arith.constant dense<5.000000e+00> : tensor<40x40xf32> - // FOLD: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} + // FOLD: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) <{fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"}> // FOLD: return %[[fc]] } @@ -1008,7 +1008,7 @@ func.func @FuseFullyConnectedReshapeAdd2DConst(%arg0: tensor<40x37xf32>, %arg1: func.return %2 : tensor<1x40x4x10xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<2.000000e+00> : tensor<40xf32> - // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: %[[rs:.*]] = "tfl.reshape"(%[[fc]] // CHECK: return %[[rs]] } @@ -1026,7 +1026,7 @@ func.func @FuseFCReshapeAdd2DConst2(%arg0: tensor<40x37xf32>, %arg1: tensor<40x3 func.return %2 : tensor<1x40x4x10xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<2.000000e+00> : tensor<40xf32> - // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: %[[rs:.*]] = "tfl.reshape"(%[[fc]] // CHECK: return %[[rs]] } @@ -1044,7 +1044,7 @@ func.func @FuseFullyConnectedReshapeAdd2DConstWithActivation(%arg0: tensor<40x37 func.return %2 : tensor<1x40x4x10xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<2.000000e+00> : tensor<40xf32> - // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) <{fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: %[[rs:.*]] = "tfl.reshape"(%[[fc]] // CHECK: return %[[rs]] } @@ -1062,7 +1062,7 @@ func.func @FuseFCReshapeAdd2DConstWithActvtn2(%arg0: tensor<40x37xf32>, %arg1: t func.return %2 : tensor<1x40x4x10xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<2.000000e+00> : tensor<40xf32> - // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) {fused_activation_function = "RELU6", keep_num_dims = false, weights_format = "DEFAULT"} + // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[cst]]) <{fused_activation_function = "RELU6", keep_num_dims = 
false, weights_format = "DEFAULT"}> // CHECK: %[[rs:.*]] = "tfl.reshape"(%[[fc]] // CHECK: return %[[rs]] } @@ -1328,7 +1328,7 @@ func.func @HardSwishPatternFail(%arg0: tensor<1xf32>) -> tensor<1xf32> { %1 = "tfl.mul"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> %2 = "tfl.mul"(%1, %six) {fused_activation_function = "NONE"} : (tensor<1xf32>, tensor) -> tensor<1xf32> func.return %2: tensor<1xf32> - // CHECK: %0 = tfl.sub(%arg0, %cst) {fused_activation_function = "RELU6"} : (tensor<1xf32>, tensor) -> tensor<1xf32> + // CHECK: %0 = tfl.sub(%arg0, %cst) <{fused_activation_function = "RELU6"}> : (tensor<1xf32>, tensor) -> tensor<1xf32> } // CHECK-LABEL: @L2NormalizePattern @@ -1339,7 +1339,7 @@ func.func @L2NormalizePattern(%arg0: tensor<2xf32>) -> tensor<2xf32> { %2 = "tfl.rsqrt"(%1) : (tensor) -> tensor %3 = "tfl.mul"(%arg0, %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> func.return %3: tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) <{fused_activation_function = "NONE"}> : (tensor<2xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] } @@ -1351,7 +1351,7 @@ func.func @L2NormalizePattern1(%arg0: tensor<2xf32>) -> tensor<2xf32> { %2 = "tfl.sqrt"(%1) : (tensor) -> tensor %3 = "tfl.div"(%arg0, %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> func.return %3: tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) <{fused_activation_function = "NONE"}> : (tensor<2xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] } @@ -1365,7 +1365,7 @@ func.func @L2NormalizePattern2(%arg0: tensor<2xf32>) -> tensor<2xf32> { %3 = "tfl.rsqrt"(%2) : (tensor<1xf32>) -> tensor<1xf32> %4 = "tfl.mul"(%arg0, %3) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<1xf32>) -> tensor<2xf32> func.return %4: tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) <{fused_activation_function = "NONE"}> : (tensor<2xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] } @@ -1379,7 +1379,7 @@ func.func @L2NormalizePattern3(%arg0: tensor<2xf32>) -> tensor<2xf32> { %3 = "tfl.sqrt"(%2) : (tensor<1xf32>) -> tensor<1xf32> %4 = "tfl.div"(%arg0, %3) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<1xf32>) -> tensor<2xf32> func.return %4: tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) <{fused_activation_function = "NONE"}> : (tensor<2xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] } @@ -1393,7 +1393,7 @@ func.func @L2NormalizePattern4(%arg0: tensor<2xf32>) -> tensor<2xf32> { %3 = "tfl.sqrt"(%2) : (tensor<1xf32>) -> tensor<1xf32> %4 = "tfl.div"(%arg0, %3) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<1xf32>) -> tensor<2xf32> func.return %4: tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: 
%[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) <{fused_activation_function = "NONE"}> : (tensor<2xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] } @@ -1407,7 +1407,7 @@ func.func @L2NormalizePattern5(%arg0: tensor<2xf32>) -> tensor<2xf32> { %3 = "tfl.sqrt"(%2) : (tensor<1xf32>) -> tensor<1xf32> %4 = "tfl.div"(%arg0, %3) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<1xf32>) -> tensor<2xf32> func.return %4: tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) {fused_activation_function = "NONE"} : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.l2_normalization"([[INPUT:%.*]]) <{fused_activation_function = "NONE"}> : (tensor<2xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] } @@ -1420,7 +1420,7 @@ func.func @InvalidL2NormalizePattern(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) %2 = "tfl.sqrt"(%1) : (tensor) -> tensor %3 = "tfl.div"(%arg1, %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> func.return %3: tensor<2xf32> - // CHECK: %3 = tfl.div([[INPUT:%.*]], %2) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor) -> tensor<2xf32> + // CHECK: %3 = tfl.div([[INPUT:%.*]], %2) <{fused_activation_function = "NONE"}> : (tensor<2xf32>, tensor) -> tensor<2xf32> // CHECK: return %3 } @@ -1435,7 +1435,7 @@ func.func @InvalidL2NormalizePattern2(%arg0: tensor<2xf32>, %arg1: tensor<2xf32> %3 = "tfl.sqrt"(%2) : (tensor<1xf32>) -> tensor<1xf32> %4 = "tfl.div"(%arg0, %3) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<1xf32>) -> tensor<2xf32> func.return %4 : tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = tfl.div([[INPUT:%.*]], %3) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<1xf32>) -> tensor<2xf32> + // CHECK: %[[RES:[0-9].*]] = tfl.div([[INPUT:%.*]], %3) <{fused_activation_function = "NONE"}> : (tensor<2xf32>, tensor<1xf32>) -> tensor<2xf32> // CHECK: return %[[RES]] } @@ -1448,7 +1448,7 @@ func.func @InvalidL2NormalizePattern3(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> %2 = "tfl.sqrt"(%1) : (tensor) -> tensor %3 = "tfl.div"(%arg0, %2) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> func.return %3: tensor<2x2xf32> - // CHECK: %[[RES:[0-9].*]] = tfl.div([[INPUT:%.*]], %2) {fused_activation_function = "NONE"} : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + // CHECK: %[[RES:[0-9].*]] = tfl.div([[INPUT:%.*]], %2) <{fused_activation_function = "NONE"}> : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> // CHECK: return %[[RES]] } @@ -1463,7 +1463,7 @@ func.func @fuseDivIntoConv2d(%arg0: tensor<1x112x112x2xf32>) -> tensor<1x28x23x2 func.return %1 : tensor<1x28x23x2xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<{{\[\[\[\[}}1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]], {{\[\[}}5.000000e+00, 6.000000e+00], [7.000000e+00, 8.000000e+00]]], {{\[\[\[}}4.500000e+00, 5.000000e+00], [5.500000e+00, 6.000000e+00]], {{\[\[}}6.500000e+00, 7.000000e+00], [7.500000e+00, 8.000000e+00]]]]> : tensor<2x2x2x2xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[1.000000e+00, 5.000000e-01]> : tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %cst, %cst_0) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x112x112x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<1x28x23x2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %cst, %cst_0) <{dilation_h_factor = 2 : i32, dilation_w_factor = 3 : 
i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<1x112x112x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<1x28x23x2xf32> // CHECK: return %[[RES]] } @@ -1478,7 +1478,7 @@ func.func @fuseDivIntoDepthwiseConv2d(%arg0: tensor<1x112x112x2xf32>) -> tensor< func.return %1 : tensor<1x112x112x2xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<{{\[\[\[\[}}1.000000e+00, 1.000000e+00], [3.000000e+00, 2.000000e+00]], {{\[\[}}5.000000e+00, 3.000000e+00], [7.000000e+00, 4.000000e+00]]], {{\[\[\[}}9.000000e+00, 5.000000e+00], [1.100000e+01, 6.000000e+00]], {{\[\[}}1.300000e+01, 7.000000e+00], [1.500000e+01, 8.000000e+00]]]]> : tensor<2x2x2x2xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<[1.000000e+00, 5.000000e-01]> : tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.depthwise_conv_2d"(%arg0, %cst, %cst_0) {depth_multiplier = 1 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x112x112x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.depthwise_conv_2d"(%arg0, %cst, %cst_0) <{depth_multiplier = 1 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<1x112x112x2xf32>, tensor<2x2x2x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> // CHECK: return %[[RES]] } @@ -1493,7 +1493,7 @@ func.func @fuseDivIntoConv2d_Scalar(%arg0: tensor<1x112x112x2xf32>) -> tensor<1x func.return %1 : tensor<1x28x23x1xf32> // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<{{\[\[\[\[}}5.000000e-01, 1.000000e+00], [1.500000e+00, 2.000000e+00]], {{\[\[}}2.500000e+00, 3.000000e+00], [3.500000e+00, 4.000000e+00]]]]> : tensor<1x2x2x2xf32> // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<5.000000e-01> : tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %[[CST1]], %[[CST2]]) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x112x112x2xf32>, tensor<1x2x2x2xf32>, tensor<2xf32>) -> tensor<1x28x23x1xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %[[CST1]], %[[CST2]]) <{dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<1x112x112x2xf32>, tensor<1x2x2x2xf32>, tensor<2xf32>) -> tensor<1x28x23x1xf32> // CHECK: return %[[RES]] } @@ -1508,7 +1508,7 @@ func.func @fuseMulIntoConv2d_Scalar(%arg0: tensor<1x112x112x2xf32>) -> tensor<1x func.return %1 : tensor<1x28x23x1xf32> // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<{{\[\[\[\[}}2.000000e+00, 4.000000e+00], [6.000000e+00, 8.000000e+00]], {{\[\[}}1.000000e+01, 1.200000e+01], [1.400000e+01, 1.600000e+01]]]]> : tensor<1x2x2x2xf32> // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<2.000000e+00> : tensor<1xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %[[CST1]], %[[CST2]]) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x112x112x2xf32>, tensor<1x2x2x2xf32>, tensor<1xf32>) -> tensor<1x28x23x1xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %[[CST1]], %[[CST2]]) <{dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", 
padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<1x112x112x2xf32>, tensor<1x2x2x2xf32>, tensor<1xf32>) -> tensor<1x28x23x1xf32> // CHECK: return %[[RES]] } @@ -1537,9 +1537,9 @@ func.func @fuseTileWithBinaryOp1(%arg0: tensor<1x1xf32>, %arg1: tensor<1x128xf32 func.return %3 : tensor<1x128xf32> // CHECK-DAG: %[[cst:.*]] = arith.constant dense<1.000000e+00> : tensor - // CHECK: %[[ADD:[0-9].*]] = tfl.add(%arg0, %[[cst]]) {fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor) -> tensor<1x1xf32> + // CHECK: %[[ADD:[0-9].*]] = tfl.add(%arg0, %[[cst]]) <{fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor) -> tensor<1x1xf32> // CHECK: %[[SQRT:[0-9].*]] = "tfl.sqrt"(%[[ADD]]) : (tensor<1x1xf32>) -> tensor<1x1xf32> - // CHECK: %[[RES:[0-9].*]] = tfl.div(%[[SQRT]], %arg1) {fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor<1x128xf32>) -> tensor<1x128xf32> + // CHECK: %[[RES:[0-9].*]] = tfl.div(%[[SQRT]], %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor<1x128xf32>) -> tensor<1x128xf32> // CHECK: return %[[RES]] } @@ -1640,7 +1640,7 @@ func.func @convertTrivialTransposeToReshape(%arg0: tensor<6x6x256x1xf32>) -> ten %0 = "tfl.transpose"(%arg0, %cst) : (tensor<6x6x256x1xf32>, tensor<4xi32>) -> tensor<1x6x6x256xf32> func.return %0 : tensor<1x6x6x256xf32> - // CHECK-DAG: [[CONST:.*]] = "tfl.pseudo_const"(){{.*}}dense<[1, 6, 6, 256]> : tensor<4xi32> + // CHECK-DAG: [[CONST:.*]] = arith.constant {{.*}}dense<[1, 6, 6, 256]> : tensor<4xi32> // CHECK: %[[RESULT:.*]] = "tfl.reshape"(%arg0, %[[CONST:.*]]) : (tensor<6x6x256x1xf32>, tensor<4xi32>) -> tensor<1x6x6x256xf32> // CHECK: return %[[RESULT]] } @@ -1797,8 +1797,8 @@ func.func @FusingbiasAdd(%arg0: tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) - func.return %2 : tensor<1x10x10x32xf32> // Fusing-LABEL: FusingbiasAdd -// Fusing: %[[add:[0-9].*]] = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> -// Fusing: %[[add1:[0-9].*]] = tfl.add(%[[add]], %arg1) {fused_activation_function = "RELU6"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> +// Fusing: %[[add:[0-9].*]] = tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> +// Fusing: %[[add1:[0-9].*]] = tfl.add(%[[add]], %arg1) <{fused_activation_function = "RELU6"}> : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> } func.func @FusingdivRelu(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { @@ -2017,7 +2017,7 @@ func.func @FoldSumKeepDim(%arg0: tensor<8x128xf32>) -> tensor<8x1xf32> { func.return %1 : tensor<8x1xf32> // CHECK-LABEL: FoldSumKeepDim -// CHECK: %[[RESULT:.*]] = "tfl.sum"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> +// CHECK: %[[RESULT:.*]] = "tfl.sum"(%arg0, %cst) <{keep_dims = true}> : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> // CHECK: return %[[RESULT]] : tensor<8x1xf32> } @@ -2029,7 +2029,7 @@ func.func @FoldReduceMinKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x128xf32> { func.return %1 : tensor<1x128xf32> // CHECK-LABEL: FoldReduceMinKeepDim -// CHECK: %[[RESULT:.*]] = "tfl.reduce_min"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> +// CHECK: %[[RESULT:.*]] = "tfl.reduce_min"(%arg0, %cst) <{keep_dims = true}> : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> // CHECK: return %[[RESULT]] : tensor<1x128xf32> 
} @@ -2041,7 +2041,7 @@ func.func @FoldReduceMaxKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x128xf32> { func.return %1 : tensor<1x128xf32> // CHECK-LABEL: FoldReduceMaxKeepDim -// CHECK: %[[RESULT:.*]] = "tfl.reduce_max"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> +// CHECK: %[[RESULT:.*]] = "tfl.reduce_max"(%arg0, %cst) <{keep_dims = true}> : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> // CHECK: return %[[RESULT]] : tensor<1x128xf32> } @@ -2053,7 +2053,7 @@ func.func @FoldReduceProdKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x1xf32> { func.return %1 : tensor<1x1xf32> // CHECK-LABEL: FoldReduceProdKeepDim -// CHECK: %[[RESULT:.*]] = "tfl.reduce_prod"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<2xi32>) -> tensor<1x1xf32> +// CHECK: %[[RESULT:.*]] = "tfl.reduce_prod"(%arg0, %cst) <{keep_dims = true}> : (tensor<8x128xf32>, tensor<2xi32>) -> tensor<1x1xf32> // CHECK: return %[[RESULT]] : tensor<1x1xf32> } @@ -2065,7 +2065,7 @@ func.func @FoldMeanKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x128xf32> { func.return %1 : tensor<1x128xf32> // CHECK-LABEL: FoldMeanKeepDim -// CHECK: %[[RESULT:.*]] = "tfl.mean"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> +// CHECK: %[[RESULT:.*]] = "tfl.mean"(%arg0, %cst) <{keep_dims = true}> : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<1x128xf32> // CHECK: return %[[RESULT]] : tensor<1x128xf32> } @@ -2079,7 +2079,7 @@ func.func @SoftMaxWithNormalization(%arg0: tensor<8x128xf32>) -> tensor<8x128xf3 func.return %4 : tensor<8x128xf32> // CHECK-LABEL: SoftMaxWithNormalization -// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x128xf32>) -> tensor<8x128xf32> +// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) <{beta = 1.000000e+00 : f32}> : (tensor<8x128xf32>) -> tensor<8x128xf32> // CHECK: return %[[RESULT]] : tensor<8x128xf32> } @@ -2091,7 +2091,7 @@ func.func @SoftMaxWithoutNormalization(%arg0: tensor<8x128xf32>) -> tensor<8x128 func.return %2 : tensor<8x128xf32> // CHECK-LABEL: SoftMaxWithoutNormalization -// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x128xf32>) -> tensor<8x128xf32> +// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) <{beta = 1.000000e+00 : f32}> : (tensor<8x128xf32>) -> tensor<8x128xf32> // CHECK: return %[[RESULT]] : tensor<8x128xf32> } @@ -2103,7 +2103,7 @@ func.func @SoftMaxWithoutNormalizationNegAxis(%arg0: tensor<8x128xf32>) -> tenso func.return %2 : tensor<8x128xf32> // CHECK-LABEL: SoftMaxWithoutNormalizationNegAxis -// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x128xf32>) -> tensor<8x128xf32> +// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) <{beta = 1.000000e+00 : f32}> : (tensor<8x128xf32>) -> tensor<8x128xf32> // CHECK: return %[[RESULT]] : tensor<8x128xf32> } @@ -2160,7 +2160,7 @@ func.func @fuseMulIntoConv2d_Splat2D(%arg0: tensor<1x112x112x2xf32>) -> tensor<1 func.return %1 : tensor<1x112x112x2xf32> // CHECK-DAG: %[[CST1:.*]] = arith.constant dense<{{\[\[\[\[}}2.000000e+00, 4.000000e+00]]], {{\[\[\[}}6.000000e+00, 8.000000e+00]]]]> : tensor<2x1x1x2xf32> // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<2.000000e+00> : tensor<2xf32> - // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %[[CST1]], %[[CST2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x112x2xf32>, tensor<2x1x1x2xf32>, 
tensor<2xf32>) -> tensor<1x112x112x2xf32> + // CHECK: %[[RES:[0-9].*]] = "tfl.conv_2d"(%arg0, %[[CST1]], %[[CST2]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x112x112x2xf32>, tensor<2x1x1x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> // CHECK: return %[[RES]] } @@ -2174,9 +2174,9 @@ func.func @AvoidFuseFullyConnectedAddWithSplat2D(%arg0: tensor<1x1x1x1x1xf32>, % func.return %1 : tensor<1x1x1x1x1xf32> - // CHECK-DAG: %[[CST1:.*]] = "tfl.no_value"() {value} : () -> none + // CHECK-DAG: %[[CST1:.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<2.000000e+00> : tensor<1x1x1x1x1xf32> - // CHECK: %[[FC_RESULT:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[CST1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x1x1x1x1xf32>, tensor<1x1xf32>, none) -> tensor<1x1x1x1x1xf32> + // CHECK: %[[FC_RESULT:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[CST1]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x1x1x1x1xf32>, tensor<1x1xf32>, none) -> tensor<1x1x1x1x1xf32> // CHECK: %[[ADD:.*]] = tfl.add %[[FC_RESULT]], %[[CST2]] {fused_activation_function = "NONE"} : tensor<1x1x1x1x1xf32> // CHECK: return %[[ADD]] : tensor<1x1x1x1x1xf32> } @@ -2215,7 +2215,7 @@ func.func @DontConvertMul1WithBroadcastToIdentity(%arg0: tensor<2xf32>) -> tenso %0 = "tfl.mul"(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> func.return %0 : tensor<2x2xf32> // CHECK-DAG: %cst = arith.constant dense<1.000000e+00> : tensor<2x2xf32> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK: return %0 : tensor<2x2xf32> } @@ -2403,13 +2403,13 @@ func.func @EliminateReduceOpsBool(%arg: tensor<1x2x1x3xi1>, %arg_scalar: tensor< // CHECK-DAG: %[[AXIS_1:.*]] = arith.constant dense<1> : tensor<1xi32> // CHECK-DAG: %[[AXIS_2:.*]] = arith.constant dense<2> : tensor<1xi32> // CHECK-DAG: %[[AXIS_3:.*]] = arith.constant dense<3> : tensor<1xi32> - // CHECK: %[[RET_0:.*]] = "tfl.reduce_any"(%arg0, %[[AXIS_0]]) {keep_dims = false} : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<2x1x3xi1> - // CHECK: %[[RET_1:.*]] = "tfl.reduce_any"(%arg0, %[[AXIS_1]]) {keep_dims = false} : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x1x1x3xi1> - // CHECK: %[[RET_2:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_1]]) {keep_dims = true} : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x1x3xi1> - // CHECK: %[[RET_3:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_2]]) {keep_dims = false} : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x2x3xi1> - // CHECK: %[[RET_4:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_3]]) {keep_dims = false} : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x2x1xi1> - // CHECK: %[[RET_5:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_3]]) {keep_dims = true} : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x2x1x1xi1> - // CHECK: %[[RET_6:.*]] = "tfl.reduce_all"(%arg2, %arg3) {keep_dims = true} : (tensor, tensor) -> tensor + // CHECK: %[[RET_0:.*]] = "tfl.reduce_any"(%arg0, %[[AXIS_0]]) <{keep_dims = false}> : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<2x1x3xi1> + // CHECK: %[[RET_1:.*]] = "tfl.reduce_any"(%arg0, %[[AXIS_1]]) <{keep_dims = 
false}> : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x1x1x3xi1> + // CHECK: %[[RET_2:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_1]]) <{keep_dims = true}> : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x1x3xi1> + // CHECK: %[[RET_3:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_2]]) <{keep_dims = false}> : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x2x3xi1> + // CHECK: %[[RET_4:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_3]]) <{keep_dims = false}> : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x2x1xi1> + // CHECK: %[[RET_5:.*]] = "tfl.reduce_all"(%arg0, %[[AXIS_3]]) <{keep_dims = true}> : (tensor<1x2x1x3xi1>, tensor<1xi32>) -> tensor<1x2x1x1xi1> + // CHECK: %[[RET_6:.*]] = "tfl.reduce_all"(%arg2, %arg3) <{keep_dims = true}> : (tensor, tensor) -> tensor // CHECK: return %arg1, %arg1, %[[RET_0]], %arg0, %[[RET_1]], %[[RET_2]], %[[RET_3]], %arg0, %[[RET_4]], %[[RET_5]], %[[RET_6]] : tensor, tensor, tensor<2x1x3xi1>, tensor<1x2x1x3xi1>, tensor<1x1x1x3xi1>, tensor<1x1x3xi1>, tensor<1x2x3xi1>, tensor<1x2x1x3xi1>, tensor<1x2x1xi1>, tensor<1x2x1x1xi1>, tensor } @@ -2436,13 +2436,13 @@ func.func @EliminateReduceOpsFloat(%arg: tensor<1x2x1x3xf32>, %arg_scalar: tenso // CHECK-DAG: %[[AXIS_1:.*]] = arith.constant dense<1> : tensor<1xi32> // CHECK-DAG: %[[AXIS_2:.*]] = arith.constant dense<2> : tensor<1xi32> // CHECK-DAG: %[[AXIS_3:.*]] = arith.constant dense<3> : tensor<1xi32> - // CHECK: %[[RET_0:.*]] = "tfl.reduce_min"(%arg0, %[[AXIS_0]]) {keep_dims = false} : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<2x1x3xf32> - // CHECK: %[[RET_1:.*]] = "tfl.reduce_prod"(%arg0, %[[AXIS_1]]) {keep_dims = false} : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x1x1x3xf32> - // CHECK: %[[RET_2:.*]] = "tfl.mean"(%arg0, %[[AXIS_1]]) {keep_dims = true} : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x1x3xf32> - // CHECK: %[[RET_3:.*]] = "tfl.sum"(%arg0, %[[AXIS_2]]) {keep_dims = false} : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x2x3xf32> - // CHECK: %[[RET_4:.*]] = "tfl.reduce_max"(%arg0, %[[AXIS_3]]) {keep_dims = false} : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x2x1xf32> - // CHECK: %[[RET_5:.*]] = "tfl.reduce_prod"(%arg0, %[[AXIS_3]]) {keep_dims = true} : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x2x1x1xf32> - // CHECK: %[[RET_6:.*]] = "tfl.sum"(%arg2, %arg3) {keep_dims = true} : (tensor, tensor) -> tensor + // CHECK: %[[RET_0:.*]] = "tfl.reduce_min"(%arg0, %[[AXIS_0]]) <{keep_dims = false}> : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<2x1x3xf32> + // CHECK: %[[RET_1:.*]] = "tfl.reduce_prod"(%arg0, %[[AXIS_1]]) <{keep_dims = false}> : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x1x1x3xf32> + // CHECK: %[[RET_2:.*]] = "tfl.mean"(%arg0, %[[AXIS_1]]) <{keep_dims = true}> : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x1x3xf32> + // CHECK: %[[RET_3:.*]] = "tfl.sum"(%arg0, %[[AXIS_2]]) <{keep_dims = false}> : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x2x3xf32> + // CHECK: %[[RET_4:.*]] = "tfl.reduce_max"(%arg0, %[[AXIS_3]]) <{keep_dims = false}> : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x2x1xf32> + // CHECK: %[[RET_5:.*]] = "tfl.reduce_prod"(%arg0, %[[AXIS_3]]) <{keep_dims = true}> : (tensor<1x2x1x3xf32>, tensor<1xi32>) -> tensor<1x2x1x1xf32> + // CHECK: %[[RET_6:.*]] = "tfl.sum"(%arg2, %arg3) <{keep_dims = true}> : (tensor, tensor) -> tensor // CHECK: return %arg1, %arg1, %[[RET_0]], %arg0, %[[RET_1]], %[[RET_2]], %[[RET_3]], %arg0, %[[RET_4]], %[[RET_5]], %[[RET_6]] : tensor, tensor, tensor<2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x1x1x3xf32>, tensor<1x1x3xf32>, 
tensor<1x2x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1xf32>, tensor<1x2x1x1xf32>, tensor } @@ -2497,7 +2497,7 @@ func.func @DontRemoveSoftmaxNegativeBetaBeforeArgmax(%arg0: tensor<16x1024xf32>) %1 = "tfl.arg_max"(%0, %cst) : (tensor<16x1024xf32>, tensor<1xi32>) -> tensor<16xi32> func.return %1 : tensor<16xi32> // CHECK-DAG: %[[CST:.*]] = arith.constant dense<-1> : tensor<1xi32> - // CHECK: %[[SOFTMAX:.*]] = "tfl.softmax"(%arg0) {beta = -1.000000e+00 : f32} : (tensor<16x1024xf32>) -> tensor<16x1024xf32> + // CHECK: %[[SOFTMAX:.*]] = "tfl.softmax"(%arg0) <{beta = -1.000000e+00 : f32}> : (tensor<16x1024xf32>) -> tensor<16x1024xf32> // CHECK: %[[ARG_MAX:.*]] = "tfl.arg_max"(%[[SOFTMAX]], %[[CST]]) : (tensor<16x1024xf32>, tensor<1xi32>) -> tensor<16xi32> // CHECK: return %[[ARG_MAX]] : tensor<16xi32> } @@ -2509,7 +2509,7 @@ func.func @DontRemoveSoftmaxNonLastAxisBeforeArgmax(%arg0: tensor<16x1024xf32>) %1 = "tfl.arg_max"(%0, %cst) : (tensor<16x1024xf32>, tensor<1xi32>) -> tensor<16xi32> func.return %1 : tensor<16xi32> // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : tensor<1xi32> - // CHECK: %[[SOFTMAX:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<16x1024xf32>) -> tensor<16x1024xf32> + // CHECK: %[[SOFTMAX:.*]] = "tfl.softmax"(%arg0) <{beta = 1.000000e+00 : f32}> : (tensor<16x1024xf32>) -> tensor<16x1024xf32> // CHECK: %[[ARG_MAX:.*]] = "tfl.arg_max"(%[[SOFTMAX]], %[[CST]]) : (tensor<16x1024xf32>, tensor<1xi32>) -> tensor<16xi32> // CHECK: return %[[ARG_MAX]] : tensor<16xi32> } @@ -2633,7 +2633,7 @@ func.func @FuseAddWithFullyConnectedWithBias(%arg: tensor<2x512xf32>) -> tensor< // 2.0 * 3.0 * 512 + 5.0 = 3077.0 // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<3.000000e+00> : tensor<1024x512xf32> // CHECK-DAG: %[[BIAS:.*]] = arith.constant dense<3.077000e+03> : tensor<1024xf32> - // CHECK: %[[RESULT:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> + // CHECK: %[[RESULT:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> // CHECK: return %[[RESULT]] } @@ -2652,6 +2652,17 @@ func.func @FuseAddWithFullyConnectedWithQuantizedWeight(%arg: tensor<2x512xf32>) // CHECK: tfl.add } +// CHECK-LABEL: @FuseBatchMatMulAndTransposeWithQuantizedWeight +func.func @FuseBatchMatMulAndTransposeWithQuantizedWeight(%arg: tensor<1x2xf32>) -> tensor<1x3xf32> { + %cst_3 = arith.constant dense<[1, 0]> : tensor<2xi32> + %79 = "tfl.pseudo_qconst"() {qtype = tensor<3x2x!quant.uniform:f32:0, {2.378620e-03,2.848260e-03,2.545190e-03}>>, value = dense<10> : tensor<3x2xi8>} : () -> tensor<3x2x!quant.uniform:f32:0, {2.378620e-03,2.848260e-03,2.545190e-03}>> + %80 = "tfl.transpose"(%79, %cst_3) : (tensor<3x2x!quant.uniform:f32:0, {2.378620e-03,2.848260e-03,2.545190e-03}>>, tensor<2xi32>) -> tensor<2x3x!quant.uniform:f32:1, {2.378620e-03,2.848260e-03,2.545190e-03}>> + %81 = "tfl.batch_matmul"(%arg, %80) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32:1, {2.378620e-03,2.848260e-03,2.545190e-03}>>) -> tensor<1x3xf32> + func.return %81 : tensor<1x3xf32> + + // CHECK: tfl.fully_connected +} + // CHECK-LABEL: @FuseAddWithFullyConnectedNoBias // Note: Currently not fused. 
func.func @FuseAddWithFullyConnectedNoBias(%arg: tensor<2x512xf32>) -> tensor<2x1024xf32> { @@ -2666,9 +2677,9 @@ func.func @FuseAddWithFullyConnectedNoBias(%arg: tensor<2x512xf32>) -> tensor<2x // CHECK-DAG: %[[ADDEND:.*]] = arith.constant dense<2.000000e+00> : tensor<512xf32> // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<3.000000e+00> : tensor<1024x512xf32> - // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[VAL_0:.*]] = tfl.add(%arg0, %[[ADDEND]]) {fused_activation_function = "NONE"} : (tensor<2x512xf32>, tensor<512xf32>) -> tensor<2x512xf32> - // CHECK: %[[VAL_1:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[WEIGHTS]], %[[BIAS]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, none) -> tensor<2x1024xf32> + // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[VAL_0:.*]] = tfl.add(%arg0, %[[ADDEND]]) <{fused_activation_function = "NONE"}> : (tensor<2x512xf32>, tensor<512xf32>) -> tensor<2x512xf32> + // CHECK: %[[VAL_1:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[WEIGHTS]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, none) -> tensor<2x1024xf32> // CHECK: return %[[VAL_1]] } @@ -2687,7 +2698,7 @@ func.func @DontFuseAddWithFullyConnectedMismatchedDimensions(%arg: tensor<2x512x // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<3.000000e+00> : tensor<1024x512xf32> // CHECK-DAG: %[[BIAS:.*]] = arith.constant dense<5.000000e+00> : tensor<1024xf32> // CHECK: %[[VAL_0:.*]] = tfl.add %arg0, %[[ADDEND]] {fused_activation_function = "NONE"} : tensor<2x512xf32> - // CHECK: %[[VAL_1:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[WEIGHTS]], %[[BIAS]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> + // CHECK: %[[VAL_1:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[WEIGHTS]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> // CHECK: return %[[VAL_1]] } @@ -2704,7 +2715,7 @@ func.func @FuseMulWithFullyConnectedWithBias(%arg: tensor<2x512xf32>) -> tensor< // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<6.000000e+00> : tensor<1024x512xf32> // CHECK-DAG: %[[BIAS:.*]] = arith.constant dense<5.000000e+00> : tensor<1024xf32> - // CHECK: %[[RESULT:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> + // CHECK: %[[RESULT:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> // CHECK: return %[[RESULT]] } @@ -2735,16 +2746,16 @@ func.func @FuseMulWithFullyConnectedNoBias(%arg: tensor<2x512xf32>) -> tensor<2x func.return %1 : tensor<2x1024xf32> // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<6.000000e+00> : tensor<1024x512xf32> - // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[VAL_0:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format 
= "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, none) -> tensor<2x1024xf32> + // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[VAL_0:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, none) -> tensor<2x1024xf32> // CHECK: return %[[VAL_0]] // NoFusing-LABEL: FuseMulWithFullyConnectedNoBias // NoFusing-DAG: %[[MWEIGHTS:.*]] = arith.constant dense<2.000000e+00> : tensor<512xf32> // NoFusing-DAG: %[[WEIGHTS:.*]] = arith.constant dense<3.000000e+00> : tensor<1024x512xf32> - // NoFusing-DAG: %[[BIAS:.*]] = "tfl.no_value"() {value} : () -> none - // NoFusing: %[[MUL:.*]] = tfl.mul(%arg0, %[[MWEIGHTS]]) {fused_activation_function = "NONE"} : (tensor<2x512xf32>, tensor<512xf32>) -> tensor<2x512xf32> - // NoFusing: %[[VAL:.*]] = "tfl.fully_connected"(%[[MUL]], %[[WEIGHTS]], %[[BIAS]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, none) -> tensor<2x1024xf32> + // NoFusing-DAG: %[[BIAS:.*]] = "tfl.no_value"() <{value}> : () -> none + // NoFusing: %[[MUL:.*]] = tfl.mul(%arg0, %[[MWEIGHTS]]) <{fused_activation_function = "NONE"}> : (tensor<2x512xf32>, tensor<512xf32>) -> tensor<2x512xf32> + // NoFusing: %[[VAL:.*]] = "tfl.fully_connected"(%[[MUL]], %[[WEIGHTS]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, none) -> tensor<2x1024xf32> // NoFusing: return %[[VAL]] } @@ -2760,8 +2771,8 @@ func.func @FuseMulWithFullyConnectedNoBiasWithOptionalAttribute(%arg: tensor<2x5 func.return %1 : tensor<2x1024xf32> // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<6.000000e+00> : tensor<1024x512xf32> - // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[VAL_0:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) {asymmetric_quantize_inputs = true, + // CHECK-DAG: %[[BIAS:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[VAL_0:.*]] = "tfl.fully_connected"(%arg0, %[[WEIGHTS]], %[[BIAS]]) <{asymmetric_quantize_inputs = true, } // CHECK-LABEL: @DontFuseMulWithFullyConnectedMismatchedDimensions @@ -2779,7 +2790,7 @@ func.func @DontFuseMulWithFullyConnectedMismatchedDimensions(%arg: tensor<2x512x // CHECK-DAG: %[[WEIGHTS:.*]] = arith.constant dense<3.000000e+00> : tensor<1024x512xf32> // CHECK-DAG: %[[BIAS:.*]] = arith.constant dense<5.000000e+00> : tensor<1024xf32> // CHECK: %[[VAL_0:.*]] = tfl.mul %arg0, %[[MULTIPLIER]] {fused_activation_function = "NONE"} : tensor<2x512xf32> - // CHECK: %[[VAL_1:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[WEIGHTS]], %[[BIAS]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> + // CHECK: %[[VAL_1:.*]] = "tfl.fully_connected"(%[[VAL_0]], %[[WEIGHTS]], %[[BIAS]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x512xf32>, tensor<1024x512xf32>, tensor<1024xf32>) -> tensor<2x1024xf32> // CHECK: return %[[VAL_1]] } @@ -2789,7 +2800,7 @@ func.func @RemoveReshapeBeforeFullyConnectedExpandDims0(%arg0: tensor<128x64xf32 %0 = "tfl.reshape"(%arg0, %cst) : (tensor<128x64xf32>, tensor<3xi32>) -> tensor<1x128x64xf32> %1 = "tfl.fully_connected"(%0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims 
= false, weights_format = "DEFAULT"} : (tensor<1x128x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<128x32xf32> func.return %1 : tensor<128x32xf32> - // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<128x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<128x32xf32> + // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%arg0, %arg1, %arg2) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<128x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<128x32xf32> // CHECK: return %[[FULLY_CONNECTED]] : tensor<128x32xf32> } @@ -2799,7 +2810,7 @@ func.func @RemoveReshapeBeforeFullyConnectedReshape(%arg0: tensor<128x64xf32>, % %0 = "tfl.reshape"(%arg0, %cst) : (tensor<128x64xf32>, tensor<3xi32>) -> tensor<4x32x64xf32> %1 = "tfl.fully_connected"(%0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x32x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<128x32xf32> func.return %1 : tensor<128x32xf32> - // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<128x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<128x32xf32> + // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%arg0, %arg1, %arg2) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<128x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<128x32xf32> // CHECK: return %[[FULLY_CONNECTED]] : tensor<128x32xf32> } @@ -2811,7 +2822,7 @@ func.func @DontRemoveReshapeBeforeFullyConnectedKeepNumDims(%arg0: tensor<128x64 func.return %1 : tensor<1x128x32xf32> // CHECK-DAG: %[[CST:.*]] = arith.constant dense<[1, 128, 64]> : tensor<3xi32> // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<128x64xf32>, tensor<3xi32>) -> tensor<1x128x64xf32> - // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%[[RESHAPE]], %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<1x128x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<1x128x32xf32> + // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%[[RESHAPE]], %arg1, %arg2) <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<1x128x64xf32>, tensor<32x64xf32>, tensor<32xf32>) -> tensor<1x128x32xf32> // CHECK: return %[[FULLY_CONNECTED]] : tensor<1x128x32xf32> } @@ -2823,7 +2834,7 @@ func.func @DontRemoveReshapeBeforeFullyConnectedChangeLastDim(%arg0: tensor<128x func.return %1 : tensor<256x32xf32> // CHECK-DAG: %[[CST:.*]] = arith.constant dense<[1, 256, 32]> : tensor<3xi32> // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<128x64xf32>, tensor<3xi32>) -> tensor<1x256x32xf32> - // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%[[RESHAPE]], %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x256x32xf32>, tensor<32x32xf32>, tensor<32xf32>) -> tensor<256x32xf32> + // CHECK: %[[FULLY_CONNECTED:.*]] = "tfl.fully_connected"(%[[RESHAPE]], %arg1, %arg2) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<1x256x32xf32>, tensor<32x32xf32>, tensor<32xf32>) -> tensor<256x32xf32> // CHECK: return %[[FULLY_CONNECTED]] : 
tensor<256x32xf32> } @@ -2838,8 +2849,8 @@ func.func @DontFuseAddWithConvActivationFunc(%arg0: tensor<1x3x1x1xf32>) -> tens // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.500000e+00> : tensor<1xf32> // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<0.000000e+00> : tensor<3xf32> // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<1.100000e+00> : tensor<3x2x1x1xf32> - // CHECK: %[[ADD:.*]] = tfl.add(%arg0, %[[CST]]) {fused_activation_function = "RELU6"} : (tensor<1x3x1x1xf32>, tensor<1xf32>) -> tensor<1x3x1x1xf32> - // CHECK: %[[CONV:.*]] = "tfl.conv_2d"(%[[ADD]], %[[CST_2]], %[[CST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x1x1xf32>, tensor<3x2x1x1xf32>, tensor<3xf32>) -> tensor<1x2x1x3xf32> + // CHECK: %[[ADD:.*]] = tfl.add(%arg0, %[[CST]]) <{fused_activation_function = "RELU6"}> : (tensor<1x3x1x1xf32>, tensor<1xf32>) -> tensor<1x3x1x1xf32> + // CHECK: %[[CONV:.*]] = "tfl.conv_2d"(%[[ADD]], %[[CST_2]], %[[CST_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x3x1x1xf32>, tensor<3x2x1x1xf32>, tensor<3xf32>) -> tensor<1x2x1x3xf32> // CHECK: return %[[CONV]] } @@ -2874,7 +2885,7 @@ func.func @replaceReshapeEqualWithOneHot(%arg: tensor<2x1xi32>) -> tensor<2x3xi1 // CHECK-DAG: %[[CST3:.*]] = arith.constant dense : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<2> : tensor<1xi32> // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST4]]) : (tensor<2x1xi32>, tensor<1xi32>) -> tensor<2xi32> - // CHECK: %[[RES:.*]] = "tfl.one_hot"(%[[RESHAPE]], %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xi1> + // CHECK: %[[RES:.*]] = "tfl.one_hot"(%[[RESHAPE]], %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xi1> } // CHECK-LABEL: ReplaceReshapeEqualWithOneHotWithBatchingDim @@ -2888,7 +2899,7 @@ func.func @ReplaceReshapeEqualWithOneHotWithBatchingDim(%arg: tensor<2x2x1xi32>) // CHECK-DAG: %[[CST3:.*]] = arith.constant dense : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<2> : tensor<2xi32> // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST4]]) : (tensor<2x2x1xi32>, tensor<2xi32>) -> tensor<2x2xi32> - // CHECK: %[[RES:.*]] = "tfl.one_hot"(%[[RESHAPE]], %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2x2xi32>, tensor, tensor, tensor) -> tensor<2x2x3xi1> + // CHECK: %[[RES:.*]] = "tfl.one_hot"(%[[RESHAPE]], %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2x2xi32>, tensor, tensor, tensor) -> tensor<2x2x3xi1> } // CHECK-LABEL: noReplaceReshapeEqualWithOneHotBadShape @@ -2937,7 +2948,7 @@ func.func @ReplaceReshapeEqualOneHotDynamicBatch(%arg0: tensor) -> (tenso // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<-1> : tensor // CHECK: %[[EXPAND_DIMS:.*]] = "tfl.expand_dims"(%arg0, %[[CST_3]]) : (tensor, tensor) -> tensor // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%0, %[[CST]]) : (tensor, tensor<1xi32>) -> tensor - // CHECK: %[[ONE_HOT:.*]] = "tfl.one_hot"(%1, %[[CST_0]], %[[CST_1]], %[[CST_2]]) {axis = -1 : i32} : (tensor, tensor, tensor, tensor) -> tensor + // CHECK: %[[ONE_HOT:.*]] = "tfl.one_hot"(%1, %[[CST_0]], %[[CST_1]], %[[CST_2]]) <{axis = -1 : i32}> : (tensor, tensor, tensor, tensor) -> tensor // CHECK-NEXT: return %[[ONE_HOT]] } @@ -2999,8 +3010,8 @@ func.func @fuseOneHotCast(%arg: 
tensor<2xi32>) -> (tensor<2x3xf32>, tensor<2x3xf // CHECK-DAG: %[[CST3:.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<5.000000e+00> : tensor // CHECK-DAG: %[[CST5:.*]] = arith.constant dense<7.000000e+00> : tensor - // CHECK: %[[RES1:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> - // CHECK: %[[RES2:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST4]], %[[CST5]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK: %[[RES1:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK: %[[RES2:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST4]], %[[CST5]]) <{axis = -1 : i32}> : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> } // CHECK-LABEL: replaceOneHotFullyConnectedWithLookup @@ -3039,9 +3050,9 @@ func.func @dontReplaceOneHotFullyConnectedWithLookupBadIndexType(%arg: tensor<2x // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<1.000000e+00> : tensor // CHECK-DAG: %[[CST3:.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<7.000000e+00> : tensor<5x3xf32> - // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi64>, tensor, tensor, tensor) -> tensor<2x3xf32> - // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x3xf32>, tensor<5x3xf32>, none) -> tensor<2x5xf32> + // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2xi64>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x3xf32>, tensor<5x3xf32>, none) -> tensor<2x5xf32> // CHECK: return %[[RES]] : tensor<2x5xf32> } @@ -3064,9 +3075,9 @@ func.func @dontReplaceOneHotFullyConnectedWithLookupBadIndexTypeWithOptionalAttr // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<1.000000e+00> : tensor // CHECK-DAG: %[[CST3:.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<7.000000e+00> : tensor<5x3xf32> - // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi64>, tensor, tensor, tensor) -> tensor<2x3xf32> - // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) {asymmetric_quantize_inputs = true, + // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2xi64>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) <{asymmetric_quantize_inputs = true, } // CHECK-LABEL: ReplaceOneHotFullyConnectedWithLookup2DRank @@ -3109,9 +3120,9 @@ func.func @dontReplaceOneHotFullyConnectedWithLookupBadOn(%arg: tensor<2xi32>) - // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<2.000000e+00> : tensor // CHECK-DAG: 
%[[CST3:.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<7.000000e+00> : tensor<5x3xf32> - // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> - // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x3xf32>, tensor<5x3xf32>, none) -> tensor<2x5xf32> + // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x3xf32>, tensor<5x3xf32>, none) -> tensor<2x5xf32> // CHECK: return %[[RES]] : tensor<2x5xf32> } @@ -3133,9 +3144,9 @@ func.func @dontReplaceOneHotFullyConnectedWithLookupBadOff(%arg: tensor<2xi32>) // CHECK-DAG: %[[CST2:.*]] = arith.constant dense<1.000000e+00> : tensor // CHECK-DAG: %[[CST3:.*]] = arith.constant dense<-1.000000e+00> : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<7.000000e+00> : tensor<5x3xf32> - // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> - // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x3xf32>, tensor<5x3xf32>, none) -> tensor<2x5xf32> + // CHECK-DAG: %[[CST5:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x3xf32>, tensor<5x3xf32>, none) -> tensor<2x5xf32> // CHECK: return %[[RES]] : tensor<2x5xf32> } @@ -3158,8 +3169,8 @@ func.func @dontReplaceOneHotFullyConnectedWithLookupBadBias(%arg: tensor<2xi32>) // CHECK-DAG: %[[CST3:.*]] = arith.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[CST4:.*]] = arith.constant dense<7.000000e+00> : tensor<5x3xf32> // CHECK-DAG: %[[CST5:.*]] = arith.constant dense<1.100000e+01> : tensor - // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) {axis = -1 : i32} : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> - // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<2x3xf32>, tensor<5x3xf32>, tensor) -> tensor<2x5xf32> + // CHECK: %[[TMP:.*]] = "tfl.one_hot"(%arg0, %[[CST1]], %[[CST2]], %[[CST3]]) <{axis = -1 : i32}> : (tensor<2xi32>, tensor, tensor, tensor) -> tensor<2x3xf32> + // CHECK: %[[RES:.*]] = "tfl.fully_connected"(%[[TMP]], %[[CST4]], %[[CST5]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<2x3xf32>, tensor<5x3xf32>, tensor) -> tensor<2x5xf32> // CHECK: return %[[RES]] : 
tensor<2x5xf32> } @@ -3291,10 +3302,10 @@ func.func @eliminateCumSumCheckIndices(%arg: tensor<1x2x1x3xf32>) -> (tensor<1x2 // CHECK-DAG: %[[AXIS_M1:.*]] = arith.constant dense<-1> : tensor // CHECK-DAG: %[[AXIS_P1:.*]] = arith.constant dense<1> : tensor // CHECK-DAG: %[[AXIS_P3:.*]] = arith.constant dense<3> : tensor - // CHECK: %[[RES_M3:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_M3]]) {exclusive = false, reverse = false} : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> - // CHECK: %[[RES_M1:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_M1]]) {exclusive = false, reverse = false} : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> - // CHECK: %[[RES_P1:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_P1]]) {exclusive = false, reverse = false} : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> - // CHECK: %[[RES_P3:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_P3]]) {exclusive = false, reverse = false} : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> + // CHECK: %[[RES_M3:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_M3]]) <{exclusive = false, reverse = false}> : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> + // CHECK: %[[RES_M1:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_M1]]) <{exclusive = false, reverse = false}> : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> + // CHECK: %[[RES_P1:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_P1]]) <{exclusive = false, reverse = false}> : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> + // CHECK: %[[RES_P3:.*]] = "tfl.cumsum"(%arg0, %[[AXIS_P3]]) <{exclusive = false, reverse = false}> : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> // CHECK: return %arg0, %[[RES_M3]], %arg0, %[[RES_M1]], %arg0, %[[RES_P1]], %arg0, %[[RES_P3]] : tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32> } @@ -3308,8 +3319,8 @@ func.func @eliminateCumSumCheckAttributes(%arg: tensor<1x2x1x3xf32>) -> (tensor< func.return %res_ff, %res_ft, %res_tf, %res_tt: tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32> // CHECK: %[[AXIS:.*]] = arith.constant dense<2> : tensor - // CHECK: %[[RES_TF:.*]] = "tfl.cumsum"(%arg0, %[[AXIS]]) {exclusive = true, reverse = false} : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> - // CHECK: %[[RES_TT:.*]] = "tfl.cumsum"(%arg0, %[[AXIS]]) {exclusive = true, reverse = true} : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> + // CHECK: %[[RES_TF:.*]] = "tfl.cumsum"(%arg0, %[[AXIS]]) <{exclusive = true, reverse = false}> : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> + // CHECK: %[[RES_TT:.*]] = "tfl.cumsum"(%arg0, %[[AXIS]]) <{exclusive = true, reverse = true}> : (tensor<1x2x1x3xf32>, tensor) -> tensor<1x2x1x3xf32> // CHECK: return %arg0, %arg0, %[[RES_TF]], %[[RES_TT]] : tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32>, tensor<1x2x1x3xf32> } @@ -3325,7 +3336,7 @@ func.func @gelu(%arg0: tensor<3xf32>) -> tensor<3xf32> { func.return %4 : tensor<3xf32> // CHECK-LABEL:gelu -// CHECK: "tfl.gelu"(%arg0) {approximate = false} : (tensor<3xf32>) -> tensor<3xf32> +// CHECK: "tfl.gelu"(%arg0) <{approximate = false}> : (tensor<3xf32>) -> tensor<3xf32> } func.func @gelu_no_match(%arg0: tensor<3xf32>) -> tensor<3xf32> { @@ -3377,7 +3388,7 @@ func.func @gelu_approximate(%arg0: tensor<3xf32>) -> tensor<3xf32> { func.return %7 : tensor<3xf32> // CHECK-LABEL:gelu_approximate -// CHECK: "tfl.gelu"(%arg0) {approximate = true} : (tensor<3xf32>) -> tensor<3xf32> +// CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : 
(tensor<3xf32>) -> tensor<3xf32> } func.func @gelu_approximate1(%arg0: tensor<3xf32>) -> tensor<3xf32> { @@ -3397,7 +3408,7 @@ func.func @gelu_approximate1(%arg0: tensor<3xf32>) -> tensor<3xf32> { func.return %7 : tensor<3xf32> // CHECK-LABEL:gelu_approximate -// CHECK: "tfl.gelu"(%arg0) {approximate = true} : (tensor<3xf32>) -> tensor<3xf32> +// CHECK: "tfl.gelu"(%arg0) <{approximate = true}> : (tensor<3xf32>) -> tensor<3xf32> } func.func @gelu_approximate_no_match(%arg0: tensor<3xf32>) -> tensor<3xf32> { @@ -3456,7 +3467,7 @@ func.func @eliminateExtraSelectLhs(%arg0: tensor<4x2x1xf32>, %arg1: tensor<4x2x1 // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : tensor<2x2xf32> // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<2.000000e+00> : tensor<2xf32> - // CHECK: %[[FC:.*]] = "tfl.fully_connected"(%arg0, %[[CST]], %[[CST_1]]) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x2x1xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2x1xf32> + // CHECK: %[[FC:.*]] = "tfl.fully_connected"(%arg0, %[[CST]], %[[CST_1]]) <{asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x2x1xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2x1xf32> // CHECK-NEXT: %[[SELECT:.*]] = "tfl.select_v2" // CHECK-NEXT: return %[[SELECT]] } @@ -3475,7 +3486,7 @@ func.func @eliminateExtraSelectRhs(%arg0: tensor<4x2x1xf32>, %arg1: tensor<4x2x1 // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : tensor<2x2xf32> // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<2.000000e+00> : tensor<2xf32> - // CHECK: %[[FC:.*]] = "tfl.fully_connected"(%arg0, %[[CST]], %[[CST_1]]) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x2x1xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2x1xf32> + // CHECK: %[[FC:.*]] = "tfl.fully_connected"(%arg0, %[[CST]], %[[CST_1]]) <{asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x2x1xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2x1xf32> // CHECK-NEXT: %[[SELECT:.*]] = "tfl.select_v2" // CHECK-NEXT: return %[[SELECT]] } @@ -3497,7 +3508,7 @@ func.func @DontEliminateExtraSelect(%arg0: tensor<4x2xf32>, %arg1: tensor<4x2xi1 // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<2.000000e+00> : tensor<2xf32> // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : tensor<4x2xf32> // CHECK: %[[SELECT:.*]] = "tfl.select_v2"(%arg1, %arg0, %[[CST_2]]) : (tensor<4x2xi1>, tensor<4x2xf32>, tensor<4x2xf32>) -> tensor<4x2xf32> - // CHECK: %[[FC:.*]] = "tfl.fully_connected"(%[[SELECT]], %[[CST]], %[[CST_1]]) {asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + // CHECK: %[[FC:.*]] = "tfl.fully_connected"(%[[SELECT]], %[[CST]], %[[CST_1]]) <{asymmetric_quantize_inputs = true, fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> // CHECK-NEXT: %[[SELECT_1:.*]] = "tfl.select_v2" // CHECK-NEXT: return %[[SELECT_1]] } @@ -3549,8 +3560,8 @@ func.func @fuseReluToMin1_StaticShapeWithSameShapeCst_Float2(%arg0: tensor<2x2xf // CHECK-LABEL: func @fuseAddAndStridedSlice func.func @fuseAddAndStridedSlice(%arg0: 
tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { // CHECK-DAG: %[[cst:.*]] = arith.constant dense<1> : tensor<1xi32> - // CHECK-DAG: %[[c0:.*]] = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %1 = "tfl.strided_slice"(%arg0, %arg1, %[[cst]], %[[c0]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = true, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK-DAG: %[[c0:.*]] = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK: %1 = "tfl.strided_slice"(%arg0, %arg1, %[[cst]], %[[c0]]) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = true, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> @@ -3562,8 +3573,8 @@ func.func @fuseAddAndStridedSlice(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> // CHECK-LABEL: func @fuseSubAndStridedSlice func.func @fuseSubAndStridedSlice(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { // CHECK-DAG: %[[cst:.*]] = arith.constant dense<1> : tensor<1xi32> - // CHECK-DAG: %[[c0:.*]] = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %1 = "tfl.strided_slice"(%arg0, %arg1, %[[cst]], %[[c0]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = true, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK-DAG: %[[c0:.*]] = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK: %1 = "tfl.strided_slice"(%arg0, %arg1, %[[cst]], %[[c0]]) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = true, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> @@ -3574,9 +3585,9 @@ func.func @fuseSubAndStridedSlice(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> // CHECK-LABEL: func @dontFuseAddAndStridedSliceNonConstantStride func.func @dontFuseAddAndStridedSliceNonConstantStrides(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor<4xi32> { - // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor - // CHECK: %1 = tfl.add(%arg1, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> - // CHECK: %2 = "tfl.strided_slice"(%arg0, %arg1, %1, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK-DAG: %0 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK: %1 = tfl.add(%arg1, %0) <{fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %2 = "tfl.strided_slice"(%arg0, %arg1, %1, %arg2) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, 
tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %cst = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %0 = "tfl.add"(%arg1, %cst) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> @@ -3586,9 +3597,9 @@ func.func @dontFuseAddAndStridedSliceNonConstantStrides(%arg0: tensor<4xi32>, %a // CHECK-LABEL: func @dontFuseAddAndStridedSliceOffset func.func @dontFuseAddAndStridedSliceOffset(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<4xi32> { - // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor - // CHECK: %1 = tfl.add(%arg2, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> - // CHECK: %2 = "tfl.strided_slice"(%arg0, %arg1, %1, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK-DAG: %0 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK: %1 = tfl.add(%arg2, %0) <{fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %2 = "tfl.strided_slice"(%arg0, %arg1, %1, %arg3) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %cst = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %0 = "tfl.add"(%arg2, %cst) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> @@ -3599,7 +3610,7 @@ func.func @dontFuseAddAndStridedSliceOffset(%arg0: tensor<4xi32>, %arg1: tensor< // CHECK-LABEL: func @dontFuseAddAndStridedSliceNonConstantOffset func.func @dontFuseAddAndStridedSliceNonConstantOffset(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor<4xi32> { // CHECK: %0 = tfl.add %arg1, %arg1 {fused_activation_function = "NONE"} : tensor<1xi32> - // CHECK: "tfl.strided_slice"(%arg0, %arg1, %0, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK: "tfl.strided_slice"(%arg0, %arg1, %0, %arg2) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %0 = "tfl.add"(%arg1, %arg1) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> @@ -3608,10 +3619,10 @@ func.func @dontFuseAddAndStridedSliceNonConstantOffset(%arg0: tensor<4xi32>, %ar // CHECK-LABEL: func @dontFuseAddAndStridedSliceBeginMask func.func @dontFuseAddAndStridedSliceBeginMask(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { - // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor - // CHECK-DAG: %1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %2 = tfl.add(%arg1, %0) {fused_activation_function = 
"NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> - // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK-DAG: %0 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK-DAG: %1 = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK: %2 = tfl.add(%arg1, %0) <{fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) <{begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> @@ -3622,10 +3633,10 @@ func.func @dontFuseAddAndStridedSliceBeginMask(%arg0: tensor<4xi32>, %arg1: tens // CHECK-LABEL: func @dontFuseAddAndStridedSliceEndMask func.func @dontFuseAddAndStridedSliceEndMask(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { - // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor - // CHECK-DAG: %1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %2 = tfl.add(%arg1, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> - // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK-DAG: %0 = "tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK-DAG: %1 = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK: %2 = tfl.add(%arg1, %0) <{fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) <{begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> @@ -3636,10 +3647,10 @@ func.func @dontFuseAddAndStridedSliceEndMask(%arg0: tensor<4xi32>, %arg1: tensor // CHECK-LABEL: func @dontFuseAddAndStridedSliceEllipsisMask func.func @dontFuseAddAndStridedSliceEllipsisMask(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { - // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor - // CHECK-DAG: %1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %2 = tfl.add(%arg1, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> - // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) {begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + // CHECK-DAG: %0 = 
"tfl.pseudo_const"() <{value = dense<1> : tensor}> : () -> tensor + // CHECK-DAG: %1 = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK: %2 = tfl.add(%arg1, %0) <{fused_activation_function = "NONE"}> : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) <{begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> @@ -3752,7 +3763,7 @@ func.func @FuseReshapeAndTransposeAroundBatchMatmul(%arg0: tensor<1x128x1024xf32 %cst_3 = arith.constant dense<[2, 0, 1]> : tensor<3xi32> %0 = "tfl.transpose"(%arg0, %cst_3) : (tensor<1x128x1024xf32>, tensor<3xi32>) -> tensor<1024x1x128xf32> %1 = "tfl.reshape"(%0, %cst_2) : (tensor<1024x1x128xf32>, tensor<2xi32>) -> tensor<1024x128xf32> - // CHECK: %[[BMM:.*]] = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} + // CHECK: %[[BMM:.*]] = "tfl.batch_matmul"(%arg0, %arg1) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> // CHECK-NOT: tfl.reshape // CHECK-NOT: tfl.transpose %2 = "tfl.batch_matmul"(%arg1, %1) {adj_x = true, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<1024x16xf32>, tensor<1024x128xf32>) -> tensor<16x128xf32> @@ -3771,7 +3782,7 @@ func.func @FuseReshapeAndTransposeAroundBatchMatmulWithLargerThan3Rank(%arg0: te %0 = "tfl.transpose"(%arg0, %cst_3) : (tensor<1x128x4x256xf32>, tensor<4xi32>) -> tensor<4x256x1x128xf32> %1 = "tfl.reshape"(%0, %cst_2) : (tensor<4x256x1x128xf32>, tensor<2xi32>) -> tensor<1024x128xf32> // CHECK: %[[RESHAE_ARG0:.*]] = "tfl.reshape"(%arg0, %[[CST:.*]]) : (tensor<1x128x4x256xf32>, tensor<3xi32>) -> tensor<1x128x1024xf32> - // CHECK: %[[BMM:.*]] = "tfl.batch_matmul"(%[[RESHAE_ARG0]], %arg1) {adj_x = false, adj_y = true, asymmetric_quantize_inputs = false} + // CHECK: %[[BMM:.*]] = "tfl.batch_matmul"(%[[RESHAE_ARG0]], %arg1) <{adj_x = false, adj_y = true, asymmetric_quantize_inputs = false}> // CHECK-NOT: tfl.reshape // CHECK-NOT: tfl.transpose %2 = "tfl.batch_matmul"(%arg1, %1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<16x1024xf32>, tensor<1024x128xf32>) -> tensor<16x128xf32> @@ -3812,7 +3823,7 @@ func.func @FuseTransposeReshapeIntoBatchMatmul(%arg0: tensor<4x1024xf32>, %arg1: %0 = "tfl.transpose"(%arg1, %cst_1) : (tensor<8x4x256xf32>, tensor<3xi32>) -> tensor<4x256x8xf32> %1 = "tfl.reshape"(%0, %cst_0) : (tensor<4x256x8xf32>, tensor<2xi32>) -> tensor<1024x8xf32> // CHECK: %[[RES0:.*]] = "tfl.reshape"(%arg1, %[[CST:.*]]) : (tensor<8x4x256xf32>, tensor<2xi32>) -> tensor<8x1024xf32> - // CHECK: %[[RES1:.*]] = "tfl.batch_matmul"(%arg0, %[[RES0]]) {adj_x = false, adj_y = true, asymmetric_quantize_inputs = false} : (tensor<4x1024xf32>, tensor<8x1024xf32>) -> tensor<4x8xf32> + // CHECK: %[[RES1:.*]] = "tfl.batch_matmul"(%arg0, %[[RES0]]) <{adj_x = false, adj_y = true, asymmetric_quantize_inputs = false}> : (tensor<4x1024xf32>, tensor<8x1024xf32>) -> tensor<4x8xf32> %2 = "tfl.batch_matmul"(%arg0, %1) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<4x1024xf32>, tensor<1024x8xf32>) -> tensor<4x8xf32> func.return %2 : tensor<4x8xf32> // CHECK: return %[[RES1]] : tensor<4x8xf32> @@ -3821,7 
+3832,7 @@ func.func @FuseTransposeReshapeIntoBatchMatmul(%arg0: tensor<4x1024xf32>, %arg1: // CHECK-LABEL: FuseTransposeAfterBatchMatmul func.func @FuseTransposeAfterBatchMatmul(%arg0: tensor<4x1024xf32>, %arg1: tensor<8x1024xf32>, %arg2: none) -> tensor<8x4xf32> { %cst = arith.constant dense<[1, 0]> : tensor<2xi32> - // CHECK: %[[RES0:.*]] = "tfl.batch_matmul"(%arg1, %arg0) {adj_x = false, adj_y = true, asymmetric_quantize_inputs = false} : (tensor<8x1024xf32>, tensor<4x1024xf32>) -> tensor<8x4xf32> + // CHECK: %[[RES0:.*]] = "tfl.batch_matmul"(%arg1, %arg0) <{adj_x = false, adj_y = true, asymmetric_quantize_inputs = false}> : (tensor<8x1024xf32>, tensor<4x1024xf32>) -> tensor<8x4xf32> %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = true, asymmetric_quantize_inputs = false} : (tensor<4x1024xf32>, tensor<8x1024xf32>) -> tensor<4x8xf32> %1 = "tfl.transpose"(%0, %cst) : (tensor<4x8xf32>, tensor<2xi32>) -> tensor<8x4xf32> func.return %1 : tensor<8x4xf32> @@ -3980,7 +3991,7 @@ func.func @FuseExcessBroadcastingOnReshapes(%arg0: tensor<1x8xf32>) -> tensor<1x // CHECK: %cst_0 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32> // CHECK: %cst_1 = arith.constant dense<[8, 1]> : tensor<2xi32> // CHECK: %0 = "tfl.reshape"(%arg0, %cst_1) : (tensor<1x8xf32>, tensor<2xi32>) -> tensor<8x1xf32> - // CHECK: %1 = tfl.mul(%0, %cst) {fused_activation_function = "NONE"} : (tensor<8x1xf32>, tensor<8x16xf32>) -> tensor<8x16xf32> + // CHECK: %1 = tfl.mul(%0, %cst) <{fused_activation_function = "NONE"}> : (tensor<8x1xf32>, tensor<8x16xf32>) -> tensor<8x16xf32> // CHECK: %2 = "tfl.reshape"(%1, %cst_0) : (tensor<8x16xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> // CHECK: return %2 : tensor<1x1x1x128xf32> } @@ -4003,7 +4014,7 @@ func.func @broadcast_to_f32_low_dim(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> return %0 : tensor<3x3xf32> // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> // CHECK: return %0 : tensor<3x3xf32> } @@ -4012,7 +4023,7 @@ func.func @broadcast_to_i32_low_dim(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> return %0 : tensor<3x3xi32> // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi32> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> // CHECK: return %0 : tensor<3x3xi32> } @@ -4021,7 +4032,7 @@ func.func @broadcast_to_low_dim_with_unknown_shape(%arg0: tensor<3xf32>, %arg1: %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<*xi32>) -> tensor<3x3xf32> return %0 : tensor<3x3xf32> // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> // CHECK: return %0 : tensor<3x3xf32> } @@ -4030,7 +4041,7 @@ func.func @broadcast_to_i16_low_dim(%arg0: 
tensor<3xi16>, %arg1: tensor<2xi32>) %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi16>, tensor<2xi32>) -> tensor<3x3xi16> return %0 : tensor<3x3xi16> // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi16> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xi16>, tensor<3x3xi16>) -> tensor<3x3xi16> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xi16>, tensor<3x3xi16>) -> tensor<3x3xi16> // CHECK: return %0 : tensor<3x3xi16> } @@ -4040,7 +4051,7 @@ func.func @broadcast_to_i32_low_dim_with_unknown_output(%arg0: tensor<3xi32>, %a return %0 : tensor<*xi32> // CHECK: %cst = arith.constant dense<1> : tensor // CHECK: %0 = "tfl.fill"(%arg1, %cst) : (tensor<2xi32>, tensor) -> tensor<*xi32> - // CHECK: %1 = tfl.mul(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<*xi32>) -> tensor<*xi32> + // CHECK: %1 = tfl.mul(%arg0, %0) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<*xi32>) -> tensor<*xi32> // CHECK: return %1 : tensor<*xi32> } @@ -4049,7 +4060,7 @@ func.func @broadcast_to_ui32(%arg0: tensor, %arg1: tensor<1xi64>) -> tenso %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor, tensor<1xi64>) -> tensor<10xui32> return %0 : tensor<10xui32> // CHECK: %cst = arith.constant dense<1> : tensor<10xui32> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor, tensor<10xui32>) -> tensor<10xui32> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor, tensor<10xui32>) -> tensor<10xui32> // CHECK: return %0 : tensor<10xui32> } @@ -4058,7 +4069,7 @@ func.func @broadcast_to_f32(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tenso %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> return %0 : tensor<3x3xf32> // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> // CHECK: return %0 : tensor<3x3xf32> } @@ -4067,7 +4078,7 @@ func.func @broadcast_to_i32(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tenso %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> return %0 : tensor<3x3xi32> // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi32> - // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> + // CHECK: %0 = tfl.mul(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> // CHECK: return %0 : tensor<3x3xi32> } @@ -4077,7 +4088,7 @@ func.func @broadcast_to_i32_with_dynamic_shape_and_output(%arg0: tensor<3xi32>, return %0 : tensor<3x?xi32> // CHECK: %cst = arith.constant dense<1> : tensor // CHECK: %0 = "tfl.fill"(%arg1, %cst) : (tensor<2xi32>, tensor) -> tensor<3x?xi32> - // CHECK: %1 = tfl.mul(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x?xi32>) -> tensor<3x?xi32> + // CHECK: %1 = tfl.mul(%arg0, %0) <{fused_activation_function = "NONE"}> : (tensor<3xi32>, tensor<3x?xi32>) -> tensor<3x?xi32> // CHECK: return %1 : tensor<3x?xi32> } diff --git a/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir b/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir index c838ead5f031f9..9e2cccaf5158d5 100644 --- 
a/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize_batch_matmul.mlir @@ -16,9 +16,9 @@ func.func @FuseTransposeFCLhsToBatchMatmul(%arg0: tensor<1024x4xf32>, %arg1: ten %cst_0 = arith.constant dense<[1, 0]> : tensor<2xi32> %cst_1 = "tfl.no_value"() {value} : () -> none %0 = "tfl.transpose"(%arg0, %cst_0) : (tensor<1024x4xf32>, tensor<2xi32>) -> tensor<4x1024xf32> - // CHECK: %[[RES0:.*]] = "tfl.batch_matmul"(%arg1, %arg0) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<8x1024xf32>, tensor<1024x4xf32>) -> tensor<8x4xf32> + // CHECK: %[[RES0:.*]] = "tfl.batch_matmul"(%arg1, %arg0) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<8x1024xf32>, tensor<1024x4xf32>) -> tensor<8x4xf32> %1 = "tfl.fully_connected"(%0, %arg1, %cst_1) {asymmetric_quantize_inputs = false, fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x1024xf32>, tensor<8x1024xf32>, none) -> tensor<4x8xf32> - // CHECK: %[[RES1:.*]] = "tfl.batch_matmul"(%[[RES0]], %arg2) {adj_x = false, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<8x4xf32>, tensor<4x256xf32>) -> tensor<8x256xf32> + // CHECK: %[[RES1:.*]] = "tfl.batch_matmul"(%[[RES0]], %arg2) <{adj_x = false, adj_y = false, asymmetric_quantize_inputs = false}> : (tensor<8x4xf32>, tensor<4x256xf32>) -> tensor<8x256xf32> %2 = "tfl.batch_matmul"(%1, %arg2) {adj_x = true, adj_y = false, asymmetric_quantize_inputs = false} : (tensor<4x8xf32>, tensor<4x256xf32>) -> tensor<8x256xf32> func.return %2 : tensor<8x256xf32> // CHECK: return %[[RES1]] : tensor<8x256xf32> @@ -34,7 +34,7 @@ func.func @Batchmatmul2Fullyconnected(%arg0: tensor<4x128x2xf32>) -> (tensor<4x1 // CHECK-SAME: [1.000000e+00, 2.000000e+00] // CHECK-SAME: tensor<1x2xf32> // CHECK: %[[FC_RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONST_WEIGHT]] - // CHECK-SAME: {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> + // CHECK-SAME: <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> // CHECK-NEXT: return %[[FC_RES]] } @@ -48,7 +48,7 @@ func.func @Batchmatmul2FullyconnectedAdjy(%arg0: tensor<4x128x2xf32>) -> (tensor // CHECK-SAME: [1.000000e+00, 2.000000e+00] // CHECK-SAME: tensor<1x2xf32> // CHECK: %[[FC_RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONST_WEIGHT]] - // CHECK-SAME: {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> + // CHECK-SAME: <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> // CHECK-NEXT: return %[[FC_RES]] } @@ -62,7 +62,7 @@ func.func @Batchmatmul2FullyconnectedAdjx(%arg0: tensor<4x2x128xf32>) -> (tensor // CHECK: %[[TRANSPOSED_X:.*]] = "tfl.transpose" // CHECK-SAME: (tensor<4x2x128xf32>, tensor<3xi32>) -> tensor<4x128x2xf32> // CHECK-NEXT: %[[FC_RES:.*]] = "tfl.fully_connected"(%[[TRANSPOSED_X]] - // CHECK-SAME: {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> + // CHECK-SAME: <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x128x2xf32>, 
tensor<1x2xf32>, none) -> tensor<4x128x1xf32> // CHECK-NEXT: return %[[FC_RES]] } @@ -87,7 +87,7 @@ func.func @Batchmatmul2FullyconnectedTransposedY(%arg0: tensor<4x128x2xf32>) -> // CHECK-SAME: [1.000000e+00, 2.000000e+00] // CHECK-SAME: tensor<1x2xf32> // CHECK: %[[FC_RES:.*]] = "tfl.fully_connected"(%arg0, %[[CONST_WEIGHT]] - // CHECK-SAME: {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> + // CHECK-SAME: <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> // CHECK-NEXT: return %[[FC_RES]] } @@ -113,7 +113,7 @@ func.func @Batchmatmul2FullyconnectedQDQ(%arg0: tensor<4x128x2xf32>, %arg1: tens // CHECK: %[[TRANSPOSED_X:.*]] = "tfl.transpose" // CHECK-SAME: (tensor<2x1xf32>, tensor<2xi32>) -> tensor<1x2xf32> // CHECK: %[[FC_RES:.*]] = "tfl.fully_connected"(%arg0, %[[TRANSPOSED_X]] - // CHECK-SAME: {fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"} : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> + // CHECK-SAME: <{fused_activation_function = "NONE", keep_num_dims = true, weights_format = "DEFAULT"}> : (tensor<4x128x2xf32>, tensor<1x2xf32>, none) -> tensor<4x128x1xf32> // CHECK-NEXT: return %[[FC_RES]] } @@ -123,8 +123,8 @@ func.func @BatchmatmulToReduceSumI32(%arg0: tensor<1x16384x257xi32>) -> (tensor< %0 = arith.constant dense<1> : tensor<1x1x16384xi32> %1 = "tfl.batch_matmul"(%0, %arg0) {adj_x = false, adj_y = false} : (tensor<1x1x16384xi32>, tensor<1x16384x257xi32>) -> tensor<1x1x257xi32> func.return %1 : tensor<1x1x257xi32> - // CHECK: %[[CONST_DIM:.*]] = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %[[RED:.*]] = "tfl.sum"(%arg0, %[[CONST_DIM]]) {keep_dims = true} : (tensor<1x16384x257xi32>, tensor<1xi32>) -> tensor<1x1x257xi32> + // CHECK: %[[CONST_DIM:.*]] = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK: %[[RED:.*]] = "tfl.sum"(%arg0, %[[CONST_DIM]]) <{keep_dims = true}> : (tensor<1x16384x257xi32>, tensor<1xi32>) -> tensor<1x1x257xi32> } // CHECK-LABEL: BatchmatmulToReduceSumF32 @@ -133,6 +133,6 @@ func.func @BatchmatmulToReduceSumF32(%arg0: tensor<1x16384x257xf32>) -> (tensor< %0 = arith.constant dense<1.0> : tensor<1x1x16384xf32> %1 = "tfl.batch_matmul"(%0, %arg0) {adj_x = false, adj_y = false} : (tensor<1x1x16384xf32>, tensor<1x16384x257xf32>) -> tensor<1x1x257xf32> func.return %1 : tensor<1x1x257xf32> - // CHECK: %[[CONST_DIM:.*]] = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK: %[[RED:.*]] = "tfl.sum"(%arg0, %[[CONST_DIM]]) {keep_dims = true} : (tensor<1x16384x257xf32>, tensor<1xi32>) -> tensor<1x1x257xf32> + // CHECK: %[[CONST_DIM:.*]] = "tfl.pseudo_const"() <{value = dense<1> : tensor<1xi32>}> : () -> tensor<1xi32> + // CHECK: %[[RED:.*]] = "tfl.sum"(%arg0, %[[CONST_DIM]]) <{keep_dims = true}> : (tensor<1x16384x257xf32>, tensor<1xi32>) -> tensor<1x1x257xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/optimize_no_verify.mlir b/tensorflow/compiler/mlir/lite/tests/optimize_no_verify.mlir index caa4af5efbbc64..d9c81887664a0f 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize_no_verify.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize_no_verify.mlir @@ -26,8 +26,8 @@ func.func @fuseBroadcastMulIntoFullyConnected(%arg0: tensor<1x10368xbf16>) -> te %1 = "tfl.mul"(%0, 
%cst_2) {fused_activation_function = "NONE"} : (tensor<1x256xbf16>, tensor<32x1x256xbf16>) -> tensor<32x1x256xbf16> func.return %1 : tensor<32x1x256xbf16> -// CHECK: %[[V0:.*]] = "tfl.fully_connected"(%arg0, {{.*}}) {{{.*}}} : (tensor<1x10368xbf16>, tensor<256x10368xbf16>, none) -> tensor<1x256xbf16> -// CHECK: %[[V1:.*]] = tfl.mul(%[[V0]], {{.*}}) {{{.*}}} : (tensor<1x256xbf16>, tensor<32x1x256xbf16>) -> tensor<32x1x256xbf16> +// CHECK: %[[V0:.*]] = "tfl.fully_connected"(%arg0, {{.*}}) <{{{.*}}}> : (tensor<1x10368xbf16>, tensor<256x10368xbf16>, none) -> tensor<1x256xbf16> +// CHECK: %[[V1:.*]] = tfl.mul(%[[V0]], {{.*}}) <{{{.*}}}> : (tensor<1x256xbf16>, tensor<32x1x256xbf16>) -> tensor<32x1x256xbf16> // CHECK: return %[[V1]] : tensor<32x1x256xbf16> } diff --git a/tensorflow/compiler/mlir/lite/tests/post-quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/post-quantize-dynamic-range.mlir index c3cc6aa588a2df..c68fab4762e120 100644 --- a/tensorflow/compiler/mlir/lite/tests/post-quantize-dynamic-range.mlir +++ b/tensorflow/compiler/mlir/lite/tests/post-quantize-dynamic-range.mlir @@ -12,9 +12,9 @@ func.func @PruneUnusedCustomOp(%arg0: tensor<1x1x1x1xf32>) -> tensor<*xf32> attr %custom_3 = "tfl.custom"(%arg0, %dq_w) {custom_code = "CustomTestOp", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> func.return %custom_3 : tensor<*xf32> -// CHECK: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>, value = dense<127> : tensor<1024x1x1x1xi8>} : () -> tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CHECK: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>, value = dense<127> : tensor<1024x1x1x1xi8>}> : () -> tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w:.*]]) : (tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1024x1x1x1xf32> -// CHECK: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> // CHECK: return %[[custom_3:.*]] } @@ -27,11 +27,11 @@ func.func @NotPruneUnusedCustomOp(%arg0: tensor<1x1x1x1xf32>) -> tensor<*xf32> a %custom_3 = "tfl.custom"(%arg0, %dq_w) {custom_code = "CustomTestOp2", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> func.return %custom_3 : tensor<*xf32> -// CHECK: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>, value = dense<127> : tensor<1024x1x1x1xi8>} : () -> tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CHECK: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>, value = dense<127> : tensor<1024x1x1x1xi8>}> : () -> tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w:.*]]) : (tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1024x1x1x1xf32> -// CHECK: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) {custom_code = "CustomTestOp2", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> -// CHECK: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) {custom_code = "CustomTestOp2", custom_option = #tfl} : 
(tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) <{custom_code = "CustomTestOp2", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) <{custom_code = "CustomTestOp2", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) <{custom_code = "CustomTestOp2", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<1024x1x1x1xf32>) -> tensor<*xf32> // CHECK: return %[[custom_3:.*]] } @@ -46,22 +46,22 @@ func.func @PruneQuantizedCustomOp(%arg0: tensor<1x1x1x1xf32>) -> tensor<*xf32> a func.return %custom : tensor<*xf32> // CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1024x1x1x1xf32> -// CHECK: %[[custom:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} +// CHECK: %[[custom:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> // CHECK: return %[[custom:.*]] -// NotPrune: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// NotPrune: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> // NotPrune: %[[dq_w:.*]] = "tfl.dequantize"(%[[w:.*]]) : (tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1024x1x1x1xf32> -// NotPrune: %[[custom:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} -// NotPrune: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} -// NotPrune: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} +// NotPrune: %[[custom:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> +// NotPrune: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> +// NotPrune: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> -// NoSideEffect: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> -// NoSideEffect: %[[custom:.*]] = "tfl.custom"(%arg0, %[[q_w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} +// NoSideEffect: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// NoSideEffect: %[[custom:.*]] = "tfl.custom"(%arg0, %[[q_w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> // NoSideEffect: return %[[custom:.*]] -// NoSideEffectWeightOnly: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// NoSideEffectWeightOnly: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> // NoSideEffectWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w:.*]]) : (tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1024x1x1x1xf32> -// NoSideEffectWeightOnly: %[[custom:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} +// NoSideEffectWeightOnly: %[[custom:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) <{custom_code = "CustomTestOp",
custom_option = #tfl}> // NoSideEffectWeightOnly: return %[[custom:.*]] } @@ -80,16 +80,16 @@ func.func @QuantizeCustomOp(%arg0: tensor<1x1x1x1xf32>) -> (tensor<*xf32>, tenso // CHECK: %[[w_1:.*]] = arith.constant dense<1.270000e+02> : tensor<4096x1x1x1xf32> // CHECK: %[[w_2:.*]] = arith.constant dense<1.270000e+02> : tensor<128x1x1x1xf32> // CHECK: %[[b:.*]] = arith.constant dense<1.270000e+02> : tensor<2048x1x1x1xf32> -// CHECK: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// CHECK: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp2", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// CHECK: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp3", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp2", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp3", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> // CHECK: return %[[custom_1:.*]], %[[custom_2:.*]], %[[custom_3:.*]] // CustomOpWeightOnly: %[[w_1:.*]] = arith.constant dense<1.270000e+02> : tensor<4096x1x1x1xf32> -// CustomOpWeightOnly: %[[q_w1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CustomOpWeightOnly: %[[q_w1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>> // CustomOpWeightOnly: %[[dq_w1:.*]] = "tfl.dequantize"(%[[q_w1]]) : (tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<4096x1x1x1xf32> // CustomOpWeightOnly: %[[w_2:.*]] = arith.constant dense<1.270000e+02> : tensor<128x1x1x1xf32> -// CustomOpWeightOnly: %[[q_b:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CustomOpWeightOnly: %[[q_b:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>> // CustomOpWeightOnly: %[[dq_b:.*]] = "tfl.dequantize"(%[[q_b]]) : (tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<2048x1x1x1xf32> // CustomOpWeightOnly: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[dq_w1]], %[[w_2]], %[[dq_b]]) {custom_code = "CustomTestOp", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> // CustomOpWeightOnly: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp2", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir 
b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir index de372f38daf906..9e34c1bd7bdbac 100644 --- a/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/post-quantize.mlir @@ -20,8 +20,8 @@ func.func @RemoveUnused(%arg0: tensor<4xf32>, %arg1: tensor) -> (tensor<2xf // CHECK-NEXT: %[[split:.*]]:4 = "tfl.split"(%arg1, %arg0) // CHECK-NEXT: return %[[split]]#0, %[[split]]#1 -// QDQ-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<4x!quant.uniform>} : (tensor<4xf32>) -> tensor<4x!quant.uniform> -// QDQ-NEXT: %[[split:.*]]:4 = "tfl.split"(%arg1, %[[q]]) {num_splits = 4 : i32} : (tensor, tensor<4x!quant.uniform>) -> (tensor<2x!quant.uniform>, tensor<2x!quant.uniform>, tensor<2x!quant.uniform>, tensor<2x!quant.uniform>) +// QDQ-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<4x!quant.uniform>}> : (tensor<4xf32>) -> tensor<4x!quant.uniform> +// QDQ-NEXT: %[[split:.*]]:4 = "tfl.split"(%arg1, %[[q]]) <{num_splits = 4 : i32}> : (tensor, tensor<4x!quant.uniform>) -> (tensor<2x!quant.uniform>, tensor<2x!quant.uniform>, tensor<2x!quant.uniform>, tensor<2x!quant.uniform>) // QDQ-NEXT: %[[out1:.*]] = "tfl.dequantize"(%[[split]]#0) : (tensor<2x!quant.uniform>) -> tensor<2xf32> // QDQ-NEXT: %[[out2:.*]] = "tfl.dequantize"(%[[split]]#1) : (tensor<2x!quant.uniform>) -> tensor<2xf32> // QDQ-NEXT: return %[[out1]], %[[out2]] : tensor<2xf32>, tensor<2xf32> @@ -37,8 +37,8 @@ func.func @RemoveTrival(%arg0: tensor<384x512x!quant.uniform>, // CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"{{.*}} -> tensor<384x128x!quant.uniform> // CHECK-NEXT: return %[[fc]] -// QDQ-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.000000e+00>>, none) -> tensor<384x128x!quant.uniform> -// QDQ-NEXT: %[[q:.*]] = "tfl.quantize"(%[[fc]]) {qtype = tensor<384x128x!quant.uniform>} : (tensor<384x128x!quant.uniform>) -> tensor<384x128x!quant.uniform> +// QDQ-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %arg1, %arg2) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> : (tensor<384x512x!quant.uniform>, tensor<128x512x!quant.uniform:f32, 1.000000e+00>>, none) -> tensor<384x128x!quant.uniform> +// QDQ-NEXT: %[[q:.*]] = "tfl.quantize"(%[[fc]]) <{qtype = tensor<384x128x!quant.uniform>}> : (tensor<384x128x!quant.uniform>) -> tensor<384x128x!quant.uniform> // QDQ-NEXT: return %[[q]] : tensor<384x128x!quant.uniform> } @@ -64,11 +64,11 @@ func.func @main2(%arg0: tensor<2x4xf32>, %arg1: tensor<2x4xf32>) -> tensor<2x4xf // CHECK: func @main(%arg0: tensor<1x224x224x3x!quant.uniform>) // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<[1, 401408]> : tensor<2xi32> -// CHECK-NEXT: %[[q_cst_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} -// CHECK-NEXT: %[[q_cst_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>} -// CHECK-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[q_cst_0]], %[[q_cst_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} +// CHECK-NEXT: %[[q_cst_0:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> +// CHECK-NEXT: 
%[[q_cst_1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<0> : tensor<32xi32>}> +// CHECK-NEXT: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[q_cst_0]], %[[q_cst_1]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> // CHECK-NEXT: %[[reshape:.*]] = "tfl.reshape"(%[[conv]], %[[cst]]) : (tensor<1x112x112x32x!quant.uniform>, tensor<2xi32>) -// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) {beta = 1.000000e+00 : f32} : (tensor<1x401408x!quant.uniform>) +// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%[[reshape]]) <{beta = 1.000000e+00 : f32}> : (tensor<1x401408x!quant.uniform>) // CHECK-NEXT: return %[[softmax]] : tensor<1x401408x!quant.uniform> // CHECK-NEXT:} @@ -81,7 +81,7 @@ func.func @main2(%arg0: tensor<2x4xf32>, %arg1: tensor<2x4xf32>) -> tensor<2x4xf func.func @HandleReturnedDequantizeWithAnotherUse(%arg0: tensor<128x16xf32>) -> (tensor<128x16xf32>, tensor<128xi32>) { // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<1> : tensor %cst = arith.constant dense<1> : tensor -// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<128x16xf32>) -> tensor<128x16xf32> +// CHECK-NEXT: %[[softmax:.*]] = "tfl.softmax"(%arg0) <{beta = 1.000000e+00 : f32}> : (tensor<128x16xf32>) -> tensor<128x16xf32> %0 = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<128x16xf32>) -> tensor<128x16xf32> %1 = "tfl.quantize"(%0) {qtype = tensor<128x16x!quant.uniform>, volatile} : (tensor<128x16xf32>) -> tensor<128x16x!quant.uniform> %2 = "tfl.dequantize"(%1) : (tensor<128x16x!quant.uniform>) -> tensor<128x16xf32> @@ -145,11 +145,11 @@ func.func @RemoveLeadingQdq(%arg0: tensor<4xf32>, %arg1: tensor) -> (tensor func.return %4 : tensor<2xf32> // CHECK-NEXT: %[[dequant:.*]] = "tfl.dequantize"(%arg0) : (tensor<4x!quant.uniform>) -> tensor<4xf32> -// CHECK-NEXT: %[[split:.*]]:4 = "tfl.split"(%arg1, %[[dequant]]) {num_splits = 4 : i32} : (tensor, tensor<4xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -// CHECK-NEXT: %[[quant:.*]] = "tfl.quantize"(%[[split]]#0) {qtype = tensor<2x!quant.uniform>, volatile} : (tensor<2xf32>) -> tensor<2x!quant.uniform> +// CHECK-NEXT: %[[split:.*]]:4 = "tfl.split"(%arg1, %[[dequant]]) <{num_splits = 4 : i32}> : (tensor, tensor<4xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) +// CHECK-NEXT: %[[quant:.*]] = "tfl.quantize"(%[[split]]#0) <{qtype = tensor<2x!quant.uniform>}> {volatile} : (tensor<2xf32>) -> tensor<2x!quant.uniform> // CHECK-NEXT: return %[[quant]] : tensor<2x!quant.uniform> -// QDQ-NEXT: %[[split:.*]]:4 = "tfl.split"(%arg1, %arg0) {num_splits = 4 : i32} : (tensor, tensor<4xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) +// QDQ-NEXT: %[[split:.*]]:4 = "tfl.split"(%arg1, %arg0) <{num_splits = 4 : i32}> : (tensor, tensor<4xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) // QDQ-NEXT: return %[[split]]#0 : tensor<2xf32> } @@ -166,7 +166,7 @@ func.func @FoldTranspose(%arg0: tensor<1x10x20x3xf32>) -> tensor<1x20x40x16xf32> return %5 : tensor<1x20x40x16xf32> // CHECK-NOT: "tfl.transpose" - // CHECK: "tfl.pseudo_qconst"() {qtype = tensor<16x3x3x3x!quant.uniform:f32, 0.047244094488188976>>, value = 
dense<"0x03030402FD010302010103FE0301020001010001FD02030101FE0400020100FDFEFD01FC01FF02FEFCFE000303FCFE00FF0301FF04010303FF0402FE01FF01000002FD03FD03FC020202FE0204FD03FF01FFFD03FEFE010003FFFF010103FD00FCFEFE020300FFFE02FD03010402040201010401FCFDFDFF0102FE010003FD00FD02FF03FF000201FF00FD0204FD010102FFFF02020003000102FF0002FF0204040300FEFFFEFDFCFC000000000201020000010001FF00FFFF01FF03FE0003FF03FFFEFE03FE03FF0000FE0303FE0002FF01FF01FF04FDFD01FD020101FDFE0101030303020203030301FD010104FD000103FC03FF02FE020402000002FDFF0103FF03010102FDFE02FF00FE01FD02FEFE0002FD02FE0203FFFFFC01FC0102FE04FCFEFC00FCFCFF03000301FFFE03030100030001000302FC01FD0000FD010101FC01020201FDFFFE02FE00FE0201020003040203010100010404FE00FDFE04FE0401FEFDFDFD00FD04FEFCFF03FFFDFF01FF04030403020200020303FF00FF03FD000104FEFD04FCFCFDFE02FF02000003FF00FF030002FDFEFD030300030401000104FCFE030103FC01FD00FC03FE"> : tensor<16x3x3x3xi8>} : () -> tensor<16x3x3x3x!quant.uniform:f32, 0.047244094488188976>> + // CHECK: "tfl.pseudo_qconst"() <{qtype = tensor<16x3x3x3x!quant.uniform:f32, 0.047244094488188976>>, value = dense<"0x03030402FD010302010103FE0301020001010001FD02030101FE0400020100FDFEFD01FC01FF02FEFCFE000303FCFE00FF0301FF04010303FF0402FE01FF01000002FD03FD03FC020202FE0204FD03FF01FFFD03FEFE010003FFFF010103FD00FCFEFE020300FFFE02FD03010402040201010401FCFDFDFF0102FE010003FD00FD02FF03FF000201FF00FD0204FD010102FFFF02020003000102FF0002FF0204040300FEFFFEFDFCFC000000000201020000010001FF00FFFF01FF03FE0003FF03FFFEFE03FE03FF0000FE0303FE0002FF01FF01FF04FDFD01FD020101FDFE0101030303020203030301FD010104FD000103FC03FF02FE020402000002FDFF0103FF03010102FDFE02FF00FE01FD02FEFE0002FD02FE0203FFFFFC01FC0102FE04FCFEFC00FCFCFF03000301FFFE03030100030001000302FC01FD0000FD010101FC01020201FDFFFE02FE00FE0201020003040203010100010404FE00FDFE04FE0401FEFDFDFD00FD04FEFCFF03FFFDFF01FF04030403020200020303FF00FF03FD000104FEFD04FCFCFDFE02FF02000003FF00FF030002FDFEFD030300030401000104FCFE030103FC01FD00FC03FE"> : tensor<16x3x3x3xi8>}> : () -> tensor<16x3x3x3x!quant.uniform:f32, 0.047244094488188976>> // CHECK-NEXT: "tfl.transpose_conv" } @@ -178,6 +178,6 @@ func.func @FoldReshape(%arg0: tensor<4xi32>, %arg1: tensor<1x48x80x16x!quant.uni %2 = "tfl.transpose_conv"(%arg0, %1, %arg1, %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32, fused_activation_function = "NONE"} : (tensor<4xi32>, tensor<1x2x2x16x!quant.uniform:f32, 0.022395913056501255>>, tensor<1x48x80x16x!quant.uniform>, tensor<1x!quant.uniform>) -> tensor<1x96x160x1x!quant.uniform> return %2 : tensor<1x96x160x1x!quant.uniform> // CHECK-NOT: "tfl.reshape" - // CHECK{LITERAL}: "tfl.pseudo_qconst"() {qtype = tensor<1x2x2x16x!quant.uniform:f32, 0.022395913056501255>>, value = dense<[[[[12, -60, -51, -59, -62, 33, 53, 17, -31, 50, 27, 7, -19, -34, -14, -26], [47, -84, -32, -36, -102, -8, -8, 35, -33, 59, 95, 40, -25, -30, -55, 25]], [[4, -41, -61, 12, -23, 48, 40, 15, -39, 52, 81, -62, -24, 17, -7, -52], [40, -70, -45, 32, -43, 2, -30, 34, -35, 58, 77, -28, -30, 37, -47, -5]]]]> : tensor<1x2x2x16xi8>} : () -> tensor<1x2x2x16x!quant.uniform:f32, 0.022395913056501255>> + // CHECK{LITERAL}: "tfl.pseudo_qconst"() <{qtype = tensor<1x2x2x16x!quant.uniform:f32, 0.022395913056501255>>, value = dense<[[[[12, -60, -51, -59, -62, 33, 53, 17, -31, 50, 27, 7, -19, -34, -14, -26], [47, -84, -32, -36, -102, -8, -8, 35, -33, 59, 95, 40, -25, -30, -55, 25]], [[4, -41, -61, 12, -23, 48, 40, 15, -39, 52, 81, -62, -24, 17, -7, -52], [40, -70, -45, 32, -43, 2, -30, 34, -35, 58, 77, -28, -30, 37, -47, -5]]]]> : tensor<1x2x2x16xi8>}> : 
() -> tensor<1x2x2x16x!quant.uniform:f32, 0.022395913056501255>> // CHECK-NEXT: "tfl.transpose_conv" } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir index 27d98c7599c93d..d7ce1a065a8540 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -46,7 +46,7 @@ func.func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: tensor<3 // CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_1]], [[VAL_6]]) : (tensor<3x4xf32>, tensor<2xi32>) -> tensor<4x3xf32> // CHECK: [[VAL_8:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<1x3xf32>, tensor<2xi32>) -> tensor<3x1xf32> -// CHECK-DAG: [[VAL_10:%.*]] = "tfl.no_value"() {value} : () -> none +// CHECK-DAG: [[VAL_10:%.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK-DAG: [[VAL_11:%.*]] = arith.constant dense<0> : tensor<2xi64> // CHECK-DAG: [[VAL_12:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi64> // CHECK: [[VAL_13:%.*]] = "tf.Slice"([[VAL_7]], [[VAL_11]], [[VAL_12]]) : (tensor<4x3xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x0xf32> @@ -85,8 +85,9 @@ func.func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: tensor<3 // CHECK-DAG: [[VAL_46:%.*]] = arith.constant dense<0.000000e+00> : tensor<3xf32> // CHECK-DAG: [[VAL_47:%.*]] = arith.constant dense<0.000000e+00> : tensor<1x3xf32> // CHECK-DAG: [[VAL_48:%.*]] = arith.constant dense<0.000000e+00> : tensor<1x1xf32> -// CHECK: [[VAL_49:%.*]] = "tfl.lstm"([[VAL_0]], [[VAL_16]], [[VAL_19]], [[VAL_13]], [[VAL_22]], [[VAL_28]], [[VAL_31]], [[VAL_25]], [[VAL_34]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_40]], [[VAL_41]], [[VAL_37]], [[VAL_42]], [[VAL_45]], [[VAL_46]], [[VAL_47]], [[VAL_48]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]]) ({ -// CHECK: }) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x?xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, none, none, none, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<3x1xf32>, tensor<3xf32>, tensor<1x3xf32>, tensor<1x1xf32>, none, none, none, none) -> tensor<1x3xf32> +// CHECK: [[VAL_49:%.*]] = "tfl.lstm"([[VAL_0]], [[VAL_16]], [[VAL_19]], [[VAL_13]], [[VAL_22]], [[VAL_28]], [[VAL_31]], [[VAL_25]], [[VAL_34]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_40]], [[VAL_41]], [[VAL_37]], [[VAL_42]], [[VAL_45]], [[VAL_46]], [[VAL_47]], [[VAL_48]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]]) +// CHECK-SAME: <{cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32}> ({ +// CHECK: }) : (tensor<1x?xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, none, none, none, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<3x1xf32>, tensor<3xf32>, tensor<1x3xf32>, tensor<1x1xf32>, none, none, none, none) -> tensor<1x3xf32> // CHECK: [[VAL_50:%.*]] = tensor.cast [[VAL_51:%.*]] : tensor<1x3xf32> to tensor<1x?xf32> // CHECK: return [[VAL_50]] : tensor<1x?xf32> @@ -98,7 +99,7 @@ func.func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: tensor<3 // CHECK: [[VAL_53:%.*]] = 
"tf.Transpose"([[VAL_1]], [[VAL_52]]) : (tensor<3x4xf32>, tensor<2xi32>) -> tensor<4x3xf32> // CHECK: [[VAL_54:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi32> // CHECK: [[VAL_55:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_54]]) : (tensor<1x3xf32>, tensor<2xi32>) -> tensor<3x1xf32> -// CHECK-DAG: [[VAL_56:%.*]] = "tfl.no_value"() {value} : () -> none +// CHECK-DAG: [[VAL_56:%.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK-DAG: [[VAL_57:%.*]] = arith.constant dense<0> : tensor<2xi64> // CHECK-DAG: [[VAL_58:%.*]] = arith.constant dense<[1, 0]> : tensor<2xi64> // CHECK: [[VAL_59:%.*]] = "tf.Slice"([[VAL_53]], [[VAL_57]], [[VAL_58]]) : (tensor<4x3xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x0xf32> @@ -145,8 +146,9 @@ func.func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: tensor<3 // CHECK: [[VAL_100:%.*]] = "tf.Slice"([[VAL_5]], [[VAL_98]], [[VAL_99]]) : (tensor<2xf32>, tensor<1xi64>, tensor<1xi64>) -> tensor<1xf32> // CHECK-DAG: [[VAL_101:%.*]] = arith.constant dense<0.000000e+00> : tensor<1xf32> // CHECK-DAG: [[VAL_102:%.*]] = arith.constant dense<0.000000e+00> : tensor<1xf32> -// CHECK: [[VAL_103:%.*]] = "tfl.lstm"([[VAL_0]], [[VAL_62]], [[VAL_65]], [[VAL_59]], [[VAL_68]], [[VAL_74]], [[VAL_77]], [[VAL_71]], [[VAL_80]], [[VAL_56]], [[VAL_56]], [[VAL_56]], [[VAL_86]], [[VAL_87]], [[VAL_83]], [[VAL_88]], [[VAL_91]], [[VAL_92]], [[VAL_93]], [[VAL_94]], [[VAL_100]], [[VAL_101]], [[VAL_97]], [[VAL_102]]) ({ -// CHECK: }) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32} : (tensor<1x?xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, none, none, none, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<3x1xf32>, tensor<3xf32>, tensor<1x3xf32>, tensor<1x1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x3xf32> +// CHECK: [[VAL_103:%.*]] = "tfl.lstm"([[VAL_0]], [[VAL_62]], [[VAL_65]], [[VAL_59]], [[VAL_68]], [[VAL_74]], [[VAL_77]], [[VAL_71]], [[VAL_80]], [[VAL_56]], [[VAL_56]], [[VAL_56]], [[VAL_86]], [[VAL_87]], [[VAL_83]], [[VAL_88]], [[VAL_91]], [[VAL_92]], [[VAL_93]], [[VAL_94]], [[VAL_100]], [[VAL_101]], [[VAL_97]], [[VAL_102]]) +// CHECK-SAME: <{cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", kernel_type = #tfl, proj_clip = 0.000000e+00 : f32}> ({ +// CHECK: }) : (tensor<1x?xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x0xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, tensor<1x3xf32>, none, none, none, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<3x1xf32>, tensor<3xf32>, tensor<1x3xf32>, tensor<1x1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x3xf32> // CHECK: [[VAL_104:%.*]] = tensor.cast [[VAL_105:%.*]] : tensor<1x3xf32> to tensor<1x?xf32> // CHECK: return [[VAL_104]] : tensor<1x?xf32> } @@ -203,8 +205,8 @@ func.func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: t // CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_20:%.*]] = 
"tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor +// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -257,8 +259,8 @@ func.func @inference_standard_indy_lstm_time_major(%arg0: tensor<8x8x8xf32>, %ar // CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_31:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, 
[[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -299,8 +301,8 @@ func.func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg // CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false}> : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ 
-354,8 +356,8 @@ func.func @inference_standard_indy_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, // CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_31:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false}> : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -398,8 +400,8 @@ func.func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_19:%.*]] = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor // CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_21:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : 
(tensor<?x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<?x8x10xf32> +// CHECK: [[VAL_21:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor<?x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<?x8x10xf32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_24:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_25:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -455,8 +457,8 @@ func.func @inference_standard_indy_lstm_time_major_go_backwards(%arg0: tensor<8x // CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor<i32>}> : () -> tensor<i32> // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor<i32>) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_41]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_31:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_41]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>,
tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -499,8 +501,8 @@ func.func @inference_standard_lstm_non_time_major_go_backwards(%arg0: tensor<8x8 // CHECK-DAG: [[VAL_18:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_19:%.*]] = "tf.Const"() <{value = dense<0> : tensor<i32>}> : () -> tensor<i32> // CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor<i32>) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_21:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_21:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false}> : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_24:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_25:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -556,8 +558,8 @@ func.func @inference_standard_indy_lstm_non_time_major_go_backwards(%arg0: tenso // CHECK-DAG: [[VAL_28:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_29:%.*]] = "tf.Const"() <{value = dense<0> : tensor<i32>}> : () -> tensor<i32> // CHECK: [[VAL_30:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_28]], [[VAL_29]]) : (tensor<40xf32>, tensor<4xi32>, tensor<i32>) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK:
[[VAL_31:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_41]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_31:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_32:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_41]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_21]], [[VAL_23]], [[VAL_25]], [[VAL_27]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_30]]#0, [[VAL_30]]#1, [[VAL_30]]#2, [[VAL_30]]#3, [[VAL_31]], [[VAL_31]], [[VAL_1]], [[VAL_2]], [[VAL_31]], [[VAL_31]], [[VAL_31]], [[VAL_31]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = true, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false}> : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<8x8x10xf32> // CHECK-DAG: [[VAL_33:%.*]] = arith.constant dense<[0, -1, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_34:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_35:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -605,8 +607,8 @@ func.func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor<?x8x8xf32>, // CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor<i32>}> : () -> tensor<i32> // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor<i32>) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<?x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<?x8x10xf32> +// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor<?x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<?x8x10xf32> // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] = arith.constant dense<1> : tensor<3xi32> @@ -655,8 +657,8 @@ func.func @inference_standard_lstm_time_major_can_fuse_last_output(%arg0: tensor // CHECK-DAG: [[VAL_16:%.*]] = "tf.Const"() <{value = dense<10> : tensor<4xi32>}> : () -> tensor<4xi32> // CHECK-DAG: [[VAL_17:%.*]] = "tf.Const"() <{value = dense<0> : tensor<i32>}> : () -> tensor<i32> // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor<i32>) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() {value} : () -> none -// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<?x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<?x8x10xf32> +// CHECK: [[VAL_19:%.*]] = "tfl.no_value"() <{value}> : () -> none +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) <{cell_clip = 1.000000e+01 : f32, diagonal_recurrent_tensors = false, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true}> : (tensor<?x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor<8x10xf32>, tensor<8x10xf32>, none, none, none, none) -> tensor<?x8x10xf32> // CHECK-DAG: [[VAL_21:%.*]] = arith.constant dense<[-1, 0, 0]> : tensor<3xi32> // CHECK-DAG: [[VAL_22:%.*]] = arith.constant dense<0> : tensor<3xi32> // CHECK-DAG: [[VAL_23:%.*]] =
arith.constant dense<1> : tensor<3xi32> @@ -811,7 +813,7 @@ func.func @tflite_custom_nms(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91x // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x100x4xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x100x91xf32>, // CHECK-SAME: %[[VAL_2:.*]]: tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = "TFLite_Detection_PostProcess", tf._reference = "mlir"} { -// CHECK: %[[VAL_3:.*]]:4 = "tfl.custom"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {custom_code = "TFLite_Detection_PostProcess", custom_option = #tfl} : (tensor<1x100x4xf32>, tensor<1x100x91xf32>, tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) +// CHECK: %[[VAL_3:.*]]:4 = "tfl.custom"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) <{custom_code = "TFLite_Detection_PostProcess", custom_option = #tfl}> : (tensor<1x100x4xf32>, tensor<1x100x91xf32>, tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) // CHECK: return %[[VAL_3]]#0, %[[VAL_3]]#1, %[[VAL_3]]#2, %[[VAL_3]]#3 : tensor, tensor, tensor, tensor // CHECK: } } @@ -861,7 +863,7 @@ func.func @max_unpooling_2d(%arg0: tensor<1x1x2x1xf32>, %arg1: tensor<1x1x2x1xi3 // CHECK-LABEL: func @max_unpooling_2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x1x2x1xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x1x2x1xi32>) -> tensor<1x2x4x1xf32> attributes {tf._implements = "MaxUnpooling2D"} { -// CHECK-NEXT: %[[VAL_2:.*]] = "tfl.custom"(%[[VAL_0]], %[[VAL_1]]) {custom_code = "MaxUnpooling2D", custom_option = #tfl} : (tensor<1x1x2x1xf32>, tensor<1x1x2x1xi32>) -> tensor<1x2x4x1xf32> +// CHECK-NEXT: %[[VAL_2:.*]] = "tfl.custom"(%[[VAL_0]], %[[VAL_1]]) <{custom_code = "MaxUnpooling2D", custom_option = #tfl}> : (tensor<1x1x2x1xf32>, tensor<1x1x2x1xi32>) -> tensor<1x2x4x1xf32> // CHECK-NEXT: return %[[VAL_2]] : tensor<1x2x4x1xf32> // CHECK-NEXT: } } @@ -978,7 +980,7 @@ func.func private @__inference_interpolate_bilinear(%arg0: tensor<2x4x4x1xf32>, // CHECK-LABEL: func private @__inference_dense_image_warp( // CHECK-SAME: %arg0: tensor<2x4x4x1xf32>, // CHECK-SAME: %arg1: tensor<2x4x4x2xf32>) -> tensor<2x4x4x1xf32> attributes {tf._implements = "DenseImageWarp"} { -// CHECK-NEXT: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "DenseImageWarp", custom_option = #tfl} : (tensor<2x4x4x1xf32>, tensor<2x4x4x2xf32>) -> tensor<2x4x4x1xf32> +// CHECK-NEXT: %0 = "tfl.custom"(%arg0, %arg1) <{custom_code = "DenseImageWarp", custom_option = #tfl}> : (tensor<2x4x4x1xf32>, tensor<2x4x4x2xf32>) -> tensor<2x4x4x1xf32> // CHECK-NEXT: return %0 : tensor<2x4x4x1xf32> // CHECK-NEXT: } } @@ -1014,7 +1016,7 @@ func.func private @dense_image_warp_invalid_output_type(%arg0: tensor<2x4x4x1xf3 // ----- module { -func.func @my_composite_op_150(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>) -> (tensor<*xf32>, tensor<*xf32>) attributes {tf._implements = #tf_type.func<@my_composite_op, {example_option = 10 : i64, tfl_fusable_op = true}>} { +func.func @my_composite_op_150(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>) -> (tensor<*xf32>, tensor<*xf32>) attributes {tf._implements = #tf_type.func<@my_composite_op, {example_option = 10 : i64, example_str = "value 1.01", tfl_fusable_op = true}>} { %0 = "tf.AddV2"(%arg0, %arg1) {device = ""} : (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<*xf32> %1 = "tf.Identity"(%0) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> %2 = "tf.Mul"(%0, %arg2) {device = ""} : (tensor<*xf32>, tensor<4x4xf32>) -> tensor<*xf32> @@ -1022,8 +1024,8 @@ func.func @my_composite_op_150(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, 
% func.return %1, %3 : tensor<*xf32>, tensor<*xf32> } -// CHECK-LABEL: func @my_composite_op_150(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>) -> (tensor<*xf32>, tensor<*xf32>) attributes {tf._implements = #tf_type.func<@my_composite_op, {example_option = 10 : i64, tfl_fusable_op = true}>} { -// CHECK-NEXT: %0:2 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "my_composite_op", custom_option = #tfl} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> (tensor<*xf32>, tensor<*xf32>) +// CHECK-LABEL: func @my_composite_op_150(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>) -> (tensor<*xf32>, tensor<*xf32>) attributes {tf._implements = #tf_type.func<@my_composite_op, {example_option = 10 : i64, example_str = "value 1.01", tfl_fusable_op = true}>} { +// CHECK-NEXT: %0:2 = "tfl.custom"(%arg0, %arg1, %arg2) <{custom_code = "my_composite_op", custom_option = #tfl}> : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> (tensor<*xf32>, tensor<*xf32>) // CHECK-NEXT: return %0#0, %0#1 : tensor<*xf32>, tensor<*xf32> // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir index b35355524127dc..baa41bc47b1d1f 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir @@ -20,32 +20,32 @@ func.func @QuantizeConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x64 // CHECK-DAG: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> -// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00, +// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00, // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) -// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) { +// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) <{ // CHECK-NOT: asymmetric_quantize_inputs = true // CHECK-SAME: dilation_h_factor = 1 : i32 // CHECK: return %[[conv:.*]] // PerTensor-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> // PerTensor-DAG: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>} +// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>}> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> -// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) { +// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) <{ // PerTensor-NOT: asymmetric_quantize_inputs = true // PerTensor-SAME: dilation_h_factor = 1 : i32 // PerTensor: return %[[conv:.*]] // MinElement-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> // MinElement-DAG: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// MinElement: %[[conv:.*]]= "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : 
(tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> +// MinElement: %[[conv:.*]]= "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> // MinElement: return %[[conv:.*]] // Float16-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf16> // Float16-DAG: %[[b:.*]] = arith.constant dense<-1.237300e+00> : tensor<64xf16> // Float16: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<64x3x3x3xf16>) -> tensor<64x3x3x3xf32> // Float16: %[[dq_b:.*]] = "tfl.dequantize"(%[[b]]) : (tensor<64xf16>) -> tensor<64xf32> -// Float16: %[[conv:.*]]= "tfl.conv_2d"(%arg0, %[[dq_w]], %[[dq_b]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> +// Float16: %[[conv:.*]]= "tfl.conv_2d"(%arg0, %[[dq_w]], %[[dq_b]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> // Float16: return %[[conv:.*]] } @@ -63,32 +63,32 @@ func.func @QuantizeDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1 // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> // CHECK-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32> -// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00} +// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00}> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) -// CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]]) { +// CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]]) <{ // CHECK-NOT: asymmetric_quantize_inputs = true // CHECK-SAME: depth_multiplier = 4 : i32 // CHECK: return %[[dconv:.*]] // PerTensor-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> // PerTensor-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32> -// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>} +// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>}> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> -// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]]) { +// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]]) <{ // PerTensor-NOT: asymmetric_quantize_inputs = true // PerTensor-SAME: depth_multiplier = 4 : i32 // PerTensor: return %[[dconv:.*]] // MinElement: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> // MinElement: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32> -// MinElement: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = 
"VALID", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> +// MinElement: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) <{depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> // MinElement: return %[[dconv:.*]] // Float16-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf16> // Float16-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf16> // Float16: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<64x3x3x3xf16>) -> tensor<64x3x3x3xf32> // Float16: %[[dq_b:.*]] = "tfl.dequantize"(%[[b]]) : (tensor<64xf16>) -> tensor<64xf32> -// Float16: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[dq_b]]) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> +// Float16: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[dq_b]]) <{depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<1x224x224x3xf32>, tensor<64x3x3x3xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32> // Float16: return %[[dconv:.*]] } @@ -103,19 +103,19 @@ func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11 func.return %fc : tensor<1x112x112x512xf32> // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x12xf32> -// CHECK-DAG: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00, +// CHECK-DAG: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00, // CHECK-DAG: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00, // CHECK-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) { +// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) <{ // CHECK-NOT: fused_activation_function = "NONE" // CHECK-SAME: asymmetric_quantize_inputs = true // CHECK: return %[[fc:.*]] // PerTensor-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x12xf32> -// PerTensor-DAG: %[[q_w:.*]]= "tfl.quantize"(%[[w:.*]]) {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>>} +// PerTensor-DAG: %[[q_w:.*]]= "tfl.quantize"(%[[w:.*]]) <{qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>>}> // PerTensor-DAG: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w:.*]]) : (tensor<512x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<512x12xf32> // PerTensor-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w:.*]], %[[b:.*]]) { +// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w:.*]], %[[b:.*]]) <{ // PerTensor-NOT: fused_activation_function = "NONE" // PerTensor-SAME: asymmetric_quantize_inputs = true // PerTensor: return %[[fc:.*]] @@ -132,21 +132,21 @@ 
func.func @QuantizeBatchMatmulWithActConst(%arg0: tensor<1x3x3x512xf32>) -> tens func.return %mm_s : tensor<1x3x3x2xf32> // CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x2xf32> -// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x2x!quant.uniform:f32, 1.000000e+00>>} +// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<512x2x!quant.uniform:f32, 1.000000e+00>>}> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x2x!quant.uniform:f32, 1.000000e+00>>) -> tensor<512x2xf32> -// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[dq_w]]) {adj_x = false, adj_y = false +// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[dq_w]]) <{adj_x = false, adj_y = false // CHECK-SAME: , asymmetric_quantize_inputs = true // CHECK: return %[[mm:.*]] // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x2xf32> -// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x2x!quant.uniform:f32, 1.000000e+00>>} +// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<512x2x!quant.uniform:f32, 1.000000e+00>>}> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x2x!quant.uniform:f32, 1.000000e+00>>) -> tensor<512x2xf32> -// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[dq_w]]) {adj_x = false, adj_y = false +// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[dq_w]]) <{adj_x = false, adj_y = false // PerTensor-SAME: , asymmetric_quantize_inputs = true // PerTensor: return %[[mm:.*]] // MinElement: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x2xf32> -// MinElement: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) {adj_x = false, adj_y = false} : (tensor<1x3x3x512xf32>, tensor<512x2xf32>) -> tensor<1x3x3x2xf32> +// MinElement: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) <{adj_x = false, adj_y = false}> : (tensor<1x3x3x512xf32>, tensor<512x2xf32>) -> tensor<1x3x3x2xf32> // MinElement: return %[[mm:.*]] } @@ -160,11 +160,11 @@ func.func @NotQuantizeBatchMatmulWithConstAct(%arg0: tensor<1x1x3x512xf32>) -> t func.return %mm_s : tensor<1x1x12x3xf32> // CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x12x512xf32> -// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%[[w]], %arg0) {adj_x = false, adj_y = true} +// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%[[w]], %arg0) <{adj_x = false, adj_y = true}> // CHECK: return %[[mm:.*]] // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x12x512xf32> -// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%[[w]], %arg0) {adj_x = false, adj_y = true} +// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%[[w]], %arg0) <{adj_x = false, adj_y = true}> // PerTensor: return %[[mm:.*]] } @@ -176,10 +176,10 @@ func.func @NotQuantizeBatchMatmulWithActAct(%arg0: tensor<1x3x3x512xf32>) -> ten %mm_s = "quantfork.stats"(%mm) {layerStats = dense<[0.000000e+00, 1.000000e+01]> : tensor<2xf32>} : (tensor<1x3x3x3xf32>) -> tensor<1x3x3x3xf32> func.return %mm : tensor<1x3x3x3xf32> -// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %arg0) {adj_x = false, adj_y = true} +// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %arg0) <{adj_x = false, adj_y = true}> // CHECK: return %[[mm:.*]] -// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %arg0) {adj_x = false, adj_y = true} +// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %arg0) <{adj_x = false, adj_y = true}> // PerTensor: return %[[mm:.*]] } @@ -212,31 +212,31 @@ func.func @QuantizeCustomOp(%arg0: tensor<1x1x1x1xf32>) -> (tensor<*xf32>, tenso // CHECK: %[[w_1:.*]] = arith.constant 
dense<1.270000e+02> : tensor<4096x1x1x1xf32> // CHECK: %[[w_2:.*]] = arith.constant dense<1.270000e+02> : tensor<128x1x1x1xf32> // CHECK: %[[b:.*]] = arith.constant dense<1.270000e+02> : tensor<2048x1x1x1xf32> -// CHECK: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// CHECK: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp2", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// CHECK: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp3", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp2", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CHECK: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp3", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> // CHECK: return %[[custom_1:.*]], %[[custom_2:.*]], %[[custom_3:.*]] // CustomOp-DAG: %[[w_1:.*]] = arith.constant dense<1.270000e+02> : tensor<4096x1x1x1xf32> // CustomOp-DAG: %[[w_2:.*]] = arith.constant dense<1.270000e+02> : tensor<128x1x1x1xf32> // CustomOp-DAG: %[[b:.*]] = arith.constant dense<1.270000e+02> : tensor<2048x1x1x1xf32> -// CustomOp-DAG: %[[q_w1:.*]] = "tfl.quantize"(%[[w_1]]) {qtype = tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>>} : (tensor<4096x1x1x1xf32>) -> tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>> -// CustomOp-DAG: %[[q_b:.*]] = "tfl.quantize"(%[[b]]) {qtype = tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>>} : (tensor<2048x1x1x1xf32>) -> tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CustomOp-DAG: %[[q_w1:.*]] = "tfl.quantize"(%[[w_1]]) <{qtype = tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>>}> : (tensor<4096x1x1x1xf32>) -> tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CustomOp-DAG: %[[q_b:.*]] = "tfl.quantize"(%[[b]]) <{qtype = tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>>}> : (tensor<2048x1x1x1xf32>) -> tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>> // CustomOp-DAG: %[[dq_w1:.*]] = "tfl.dequantize"(%[[q_w1]]) : (tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<4096x1x1x1xf32> // CustomOp: %[[dq_b:.*]] = "tfl.dequantize"(%[[q_b]]) : (tensor<2048x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<2048x1x1x1xf32> -// CustomOp: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[dq_w1]], %[[w_2]], %[[dq_b]]) {custom_code = "CustomTestOp", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// CustomOp: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp2", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, 
tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// CustomOp: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[dq_b]]) {custom_code = "CustomTestOp3", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CustomOp: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[dq_w1]], %[[w_2]], %[[dq_b]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CustomOp: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp2", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// CustomOp: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[dq_b]]) <{custom_code = "CustomTestOp3", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> // CustomOp: return %[[custom_1:.*]], %[[custom_2:.*]], %[[custom_3:.*]] // MinElement-DAG: %[[w_1:.*]] = arith.constant dense<1.270000e+02> : tensor<4096x1x1x1xf32> -// MinElement-DAG: %[[q_w1:.*]] = "tfl.quantize"(%[[w_1]]) {qtype = tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>>} : (tensor<4096x1x1x1xf32>) -> tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// MinElement-DAG: %[[q_w1:.*]] = "tfl.quantize"(%[[w_1]]) <{qtype = tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>>}> : (tensor<4096x1x1x1xf32>) -> tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>> // MinElement-DAG: %[[dq_w1:.*]] = "tfl.dequantize"(%[[q_w1]]) : (tensor<4096x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> tensor<4096x1x1x1xf32> // MinElement-DAG: %[[w_2:.*]] = arith.constant dense<1.270000e+02> : tensor<128x1x1x1xf32> // MinElement-DAG: %[[b:.*]] = arith.constant dense<1.270000e+02> : tensor<2048x1x1x1xf32> -// MinElement: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[dq_w1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// MinElement: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp2", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> -// MinElement: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) {custom_code = "CustomTestOp3", custom_option = #tfl} : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// MinElement: %[[custom_1:.*]] = "tfl.custom"(%arg0, %[[dq_w1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// MinElement: %[[custom_2:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp2", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> +// MinElement: %[[custom_3:.*]] = "tfl.custom"(%arg0, %[[w_1]], %[[w_2]], %[[b]]) <{custom_code = "CustomTestOp3", custom_option = #tfl}> : (tensor<1x1x1x1xf32>, tensor<4096x1x1x1xf32>, tensor<128x1x1x1xf32>, tensor<2048x1x1x1xf32>) -> tensor<*xf32> // MinElement: return %[[custom_1:.*]], %[[custom_2:.*]], %[[custom_3:.*]] 
} @@ -252,18 +252,18 @@ func.func @QuantizeTransposeConvWeightOnly(%arg0: tensor<32x4x4x128xf32>, %arg1: // CHECK-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32> // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32> -// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>} : (tensor<1x32x42x128xf32>) -> tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>> +// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>}> : (tensor<1x32x42x128xf32>) -> tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>) -> tensor<1x32x42x128xf32> -// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w:.*]], %arg0, %[[b:.*]]) { +// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w:.*]], %arg0, %[[b:.*]]) <{ // CHECK-NOT: asymmetric_quantize_inputs = true // CHECK-SAME: padding = "SAME" // CHECK: return %[[tconv:.*]] // PerTensor-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32> // PerTensor-DAG: %[[b:.*]]= arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32> -// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>} : (tensor<1x32x42x128xf32>) -> tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>}> : (tensor<1x32x42x128xf32>) -> tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1x32x42x128xf32> -// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w:.*]], %arg0, %[[b:.*]]) { +// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w:.*]], %arg0, %[[b:.*]]) <{ // PerTensor-NOT: asymmetric_quantize_inputs = true // PerTensor-SAME: padding = "SAME" // PerTensor: return %[[tconv:.*]] @@ -278,13 +278,13 @@ func.func @QuantizeGatherWeightOnly(%arg0: tensor<3xi32>) -> tensor<3x3x3x3xf32> func.return %emb_s : tensor<3x3x3x3xf32> // CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> -// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> // CHECK: %[[emb:.*]] = "tfl.gather"(%[[dq_w]], %arg0) // CHECK: return %[[emb:.*]] // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> -// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> // PerTensor: %[[emb:.*]] = "tfl.gather"(%[[dq_w]], %arg0) // PerTensor: return %[[emb:.*]] @@ -312,10 +312,10 @@ func.func @NotQuantizeConv3D(%arg0: tensor) -> tensor // CHECK-DAG: %[[out_ch:.*]] = arith.constant dense<16> : tensor<1xi64> -// CHECK-DAG: %[[const:.*]] = "tfl.no_value"() {value} 
: () -> none +// CHECK-DAG: %[[const:.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<3x3x3x8x16xf32> // CHECK-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// CHECK: %[[conv3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[const]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor, tensor<3x3x3x8x16xf32>, none) -> tensor +// CHECK: %[[conv3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[const]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor, tensor<3x3x3x8x16xf32>, none) -> tensor // CHECK: %2 = "tfl.shape"(%[[conv3d]]) : (tensor) -> tensor<5xi64> // CHECK: %3 = "tfl.broadcast_args"(%2, %[[out_ch]]) : (tensor<5xi64>, tensor<1xi64>) -> tensor<5xi64> // CHECK: %4 = "tfl.broadcast_to"(%[[conv3d]], %3) : (tensor, tensor<5xi64>) -> tensor @@ -324,10 +324,10 @@ func.func @NotQuantizeConv3D(%arg0: tensor) -> tensor // PerTensor: %[[out_ch:.*]] = arith.constant dense<16> : tensor<1xi64> -// PerTensor: %[[const:.*]] = "tfl.no_value"() {value} : () -> none +// PerTensor: %[[const:.*]] = "tfl.no_value"() <{value}> : () -> none // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<3x3x3x8x16xf32> // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// PerTensor: %[[conv3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[const]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor, tensor<3x3x3x8x16xf32>, none) -> tensor +// PerTensor: %[[conv3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[const]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor, tensor<3x3x3x8x16xf32>, none) -> tensor // PerTensor: %2 = "tfl.shape"(%[[conv3d]]) : (tensor) -> tensor<5xi64> // PerTensor: %3 = "tfl.broadcast_args"(%2, %[[out_ch]]) : (tensor<5xi64>, tensor<1xi64>) -> tensor<5xi64> // PerTensor: %4 = "tfl.broadcast_to"(%[[conv3d]], %3) : (tensor, tensor<5xi64>) -> tensor @@ -338,10 +338,10 @@ func.func @NotQuantizeConv3D(%arg0: tensor) -> tensor : tensor<1xi64> // Float16-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<3x3x3x8x16xf16> // Float16-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf16> -// Float16-DAG: %[[const:.*]] = "tfl.no_value"() {value} : () -> none +// Float16-DAG: %[[const:.*]] = "tfl.no_value"() <{value}> : () -> none // Float16-DAG: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<3x3x3x8x16xf16>) -> tensor<3x3x3x8x16xf32> // Float16-DAG: %[[dq_b:.*]] = "tfl.dequantize"(%[[b]]) : (tensor<16xf16>) -> tensor<16xf32> -// Float16: %[[conv3d:.*]] = "tfl.conv_3d"(%arg0, %[[dq_w]], %[[const]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} : (tensor, tensor<3x3x3x8x16xf32>, none) -> tensor +// Float16: %[[conv3d:.*]] = "tfl.conv_3d"(%arg0, %[[dq_w]], %[[const]]) 
<{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor, tensor<3x3x3x8x16xf32>, none) -> tensor // Float16: %4 = "tfl.shape"(%[[conv3d]]) : (tensor) -> tensor<5xi64> // Float16: %5 = "tfl.broadcast_args"(%4, %[[out_ch]]) : (tensor<5xi64>, tensor<1xi64>) -> tensor<5xi64> // Float16: %6 = "tfl.broadcast_to"(%[[conv3d]], %5) : (tensor, tensor<5xi64>) -> tensor @@ -367,24 +367,24 @@ func.func @QuantizeMultiUses(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112 // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> // CHECK-DAG: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// CHECK-DAG: %[[q_w1:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00} -// CHECK-DAG: %[[q_w2:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00 +// CHECK-DAG: %[[q_w1:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00}> +// CHECK-DAG: %[[q_w2:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00 // CHECK-DAG: %[[dq_w1:.*]] = "tfl.dequantize"(%[[q_w1]]) // CHECK-DAG: %[[dq_w2:.*]] = "tfl.dequantize"(%[[q_w2]]) // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w2]], %[[b]]) // CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w1]], %[[b]]) -// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true +// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) <{adj_x = false, adj_y = true // CHECK-NOT: , asymmetric_quantize_inputs = true // CHECK-SAME: } // CHECK: return %[[bmm:.*]] // PerTensor-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32> // PerTensor-DAG: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>} +// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>}> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> // PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) // PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]]) -// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true +// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) <{adj_x = false, adj_y = true // PerTensor-NOT: , asymmetric_quantize_inputs = true // PerTensor-SAME: } // PerTensor: return %[[bmm:.*]] @@ -395,7 +395,7 @@ func.func @QuantizeMultiUses(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112 // Float16-DAG: %[[dq_b:.*]] = "tfl.dequantize"(%[[b:.*]]) : (tensor<64xf16>) -> tensor<64xf32> // Float16: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[dq_b]]) // Float16: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[dq_b]]) -// Float16: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true +// Float16: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) <{adj_x = false, adj_y = true // Float16: return %[[bmm:.*]] } diff --git 
a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir index 15ede0019e12d6..31d31c656b1d5f 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir @@ -58,7 +58,7 @@ func.func @QuantizeUnidirectionalLstmFullPerTensor(%arg0: tensor<1x2x3xf32>) -> // CHECK-DAG: %[[input_6:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1x!quant.uniform:f32, 0.0047244096365500624>>) -> tensor<1x1xf32> // CHECK-DAG: %[[input_7:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1x!quant.uniform:f32, 0.0055118109297564652>>) -> tensor<1x1xf32> // CHECK-DAG: %[[input_8:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1x!quant.uniform:f32, 0.0062992126922907796>>) -> tensor<1x1xf32> -// CHECK-DAG: %[[input_9:.*]] = "tfl.no_value"() {value} : () -> none +// CHECK-DAG: %[[input_9:.*]] = "tfl.no_value"() <{value}> : () -> none // CHECK-DAG: %[[input_10:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3x!quant.uniform>) -> tensor<3xf32> // CHECK-DAG: %[[input_11:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3x!quant.uniform>) -> tensor<3xf32> // CHECK-DAG: %[[input_12:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3x!quant.uniform>) -> tensor<3xf32> @@ -73,7 +73,7 @@ func.func @QuantizeUnidirectionalLstmFullPerTensor(%arg0: tensor<1x2x3xf32>) -> // CHECK-SAME: %[[input_10]], %[[input_11]], %[[input_12]], %[[input_13]], // CHECK-SAME: %[[input_9]], %[[input_9]], // CHECK-SAME: %[[input_14]], %[[input_15]], -// CHECK-SAME: %[[input_9]], %[[input_9]], %[[input_9]], %[[input_9]]) { +// CHECK-SAME: %[[input_9]], %[[input_9]], %[[input_9]], %[[input_9]]) <{ // CHECK-SAME: asymmetric_quantize_inputs = false, // CHECK-SAME: cell_clip = 1.000000e+01 : f32, // CHECK-SAME: effective_hidden_scale_intermediate = tensor<0x!quant.uniform>, @@ -83,7 +83,7 @@ func.func @QuantizeUnidirectionalLstmFullPerTensor(%arg0: tensor<1x2x3xf32>) -> // CHECK-SAME: input_to_input_intermediate = tensor<0xf32>, // CHECK-SAME: input_to_output_intermediate = tensor<0xf32>, // CHECK-SAME: proj_clip = 0.000000e+00 : f32, -// CHECK-SAME: time_major = false} : ( +// CHECK-SAME: time_major = false}> : ( // CHECK-SAME: tensor<1x2x3xf32>, // CHECK-SAME: tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, // CHECK-SAME: tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, @@ -93,7 +93,7 @@ func.func @QuantizeUnidirectionalLstmFullPerTensor(%arg0: tensor<1x2x3xf32>) -> // CHECK-SAME: tensor<1x3xf32>, tensor<1x3xf32>, // CHECK-SAME: none, none, none, none) // CHECK-SAME: -> tensor<1x2x3xf32> -// CHECK: "tfl.quantize"(%[[lstm]]) {qtype = tensor<1x2x3x!quant.uniform>, volatile} : (tensor<1x2x3xf32>) -> tensor<1x2x3x!quant.uniform> +// CHECK: "tfl.quantize"(%[[lstm]]) <{qtype = tensor<1x2x3x!quant.uniform>}> {volatile} : (tensor<1x2x3xf32>) -> tensor<1x2x3x!quant.uniform> } @@ -176,7 +176,7 @@ func.func @QuantizeUnidirectionalLstmFullPerAxis(%arg0: tensor<1x2x3xf32>) -> (t // CHECK-DAG: %[[input_6:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1x!quant.uniform:f32, 0.0047244096365500624>>) -> tensor<1x1xf32> // CHECK-DAG: %[[input_7:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1x!quant.uniform:f32, 0.0055118109297564652>>) -> tensor<1x1xf32> // CHECK-DAG: %[[input_8:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1x!quant.uniform:f32, 0.0062992126922907796>>) -> tensor<1x1xf32> -// CHECK-DAG: %[[input_9:.*]] = "tfl.no_value"() {value} : () -> none 
+// CHECK-DAG: %[[input_9:.*]] = "tfl.no_value"() <{value}> : () -> none
// CHECK-DAG: %[[input_10:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3x!quant.uniform>) -> tensor<3xf32>
// CHECK-DAG: %[[input_11:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3x!quant.uniform>) -> tensor<3xf32>
// CHECK-DAG: %[[input_12:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3x!quant.uniform>) -> tensor<3xf32>
@@ -191,14 +191,14 @@ func.func @QuantizeUnidirectionalLstmFullPerAxis(%arg0: tensor<1x2x3xf32>) -> (t
// CHECK-SAME: %[[input_10]], %[[input_11]], %[[input_12]], %[[input_13]],
// CHECK-SAME: %[[input_9]], %[[input_9]],
// CHECK-SAME: %[[input_14]], %[[input_15]],
-// CHECK-SAME: %[[input_9]], %[[input_9]], %[[input_9]], %[[input_9]]) {
+// CHECK-SAME: %[[input_9]], %[[input_9]], %[[input_9]], %[[input_9]]) <{
// CHECK-SAME: asymmetric_quantize_inputs = false,
// CHECK-SAME: cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform>,
// CHECK-SAME: fused_activation_function = "TANH",
// CHECK-SAME: input_to_cell_intermediate = tensor<0xf32>,
// CHECK-SAME: input_to_forget_intermediate = tensor<0xf32>,
// CHECK-SAME: input_to_input_intermediate = tensor<0xf32>,
-// CHECK-SAME: input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false} : (
+// CHECK-SAME: input_to_output_intermediate = tensor<0xf32>, proj_clip = 0.000000e+00 : f32, time_major = false}> : (
// CHECK-SAME: tensor<1x2x3xf32>,
// CHECK-SAME: tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>,
// CHECK-SAME: tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>,
@@ -208,7 +208,7 @@ func.func @QuantizeUnidirectionalLstmFullPerAxis(%arg0: tensor<1x2x3xf32>) -> (t
// CHECK-SAME: tensor<1x3xf32>, tensor<1x3xf32>,
// CHECK-SAME: none, none, none, none)
// CHECK-SAME: -> tensor<1x2x3xf32>
-// CHECK: %32 = "tfl.quantize"(%31) {qtype = tensor<1x2x3x!quant.uniform>, volatile} : (tensor<1x2x3xf32>) -> tensor<1x2x3x!quant.uniform>
+// CHECK: %32 = "tfl.quantize"(%31) <{qtype = tensor<1x2x3x!quant.uniform>}> {volatile} : (tensor<1x2x3xf32>) -> tensor<1x2x3x!quant.uniform>
}
@@ -219,10 +219,10 @@ func.func @QuantizeFixedOutputRangeInterfaceOpSoftmax(%arg0: tensor<1x1xf32>) ->
%2 = "quantfork.stats"(%1) {layerStats = dense<[-1.0, 1.0]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32>
func.return %2 : tensor<1x1xf32>
-// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[sm:.*]] = "tfl.softmax"(%[[dq1]]) {{{.*}}} : (tensor<1x1xf32>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[sm]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[sm:.*]] = "tfl.softmax"(%[[dq1]]) <{{{.*}}}> : (tensor<1x1xf32>) -> tensor<1x1xf32>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[sm]]) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
}
@@ -233,10 +233,10 @@ func.func @QuantizeFixedOutputRangeInterfaceOpL2Normalization(%arg0: tensor<1x1x
%2 = "quantfork.stats"(%1) {layerStats = dense<[-1.0, 1.0]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32>
func.return %2 : tensor<1x1xf32>
-// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[l2:.*]] = "tfl.l2_normalization"(%[[dq1]]) {{{.*}}} : (tensor<1x1xf32>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[l2]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[l2:.*]] = "tfl.l2_normalization"(%[[dq1]]) <{{{.*}}}> : (tensor<1x1xf32>) -> tensor<1x1xf32>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[l2]]) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
}
@@ -247,10 +247,10 @@ func.func @QuantizeFixedOutputRangeInterfaceOpLogistic(%arg0: tensor<1x1xf32>) -
%2 = "quantfork.stats"(%1) {layerStats = dense<[-1.0, 1.0]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32>
func.return %2 : tensor<1x1xf32>
-// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
// CHECK-NEXT: %[[lo:.*]] = "tfl.logistic"(%[[dq1]]) : (tensor<1x1xf32>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[lo]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[lo]]) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
}
@@ -261,10 +261,10 @@ func.func @QuantizeFixedOutputRangeInterfaceOpTanh(%arg0: tensor<1x1xf32>) -> (t
%2 = "quantfork.stats"(%1) {layerStats = dense<[-1.0, 1.0]> : tensor<2xf32>} : (tensor<1x1xf32>) -> tensor<1x1xf32>
func.return %2 : tensor<1x1xf32>
-// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
// CHECK-NEXT: %[[ta:.*]] = "tfl.tanh"(%[[dq1]]) : (tensor<1x1xf32>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[ta]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[ta]]) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
}
@@ -277,10 +277,10 @@ func.func @QuantizeReshapeOp(%arg0: tensor<1x1x3xf32>) -> (tensor<1x3xf32>) {
func.return %4 : tensor<1x3xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<[-1, 3]> : tensor<2xi32>
-// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x3x!quant.uniform>, volatile} : (tensor<1x1x3xf32>) -> tensor<1x1x3x!quant.uniform>
+// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x3x!quant.uniform>}> {volatile} : (tensor<1x1x3xf32>) -> tensor<1x1x3x!quant.uniform>
// CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x1x3x!quant.uniform>) -> tensor<1x1x3xf32>
// CHECK-NEXT: %[[rs:.*]] = "tfl.reshape"(%[[dq1]], %[[cst]]) : (tensor<1x1x3xf32>, tensor<2xi32>) -> tensor<1x3xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[rs]]) {qtype = tensor<1x3x!quant.uniform>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[rs]]) <{qtype = tensor<1x3x!quant.uniform>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32>
// CHECK-NEXT: return %[[dq2]] : tensor<1x3xf32>
}
@@ -295,15 +295,15 @@ func.func @QuantizeFullyConnectedOp(%arg0: tensor<1x3xf32>) -> (tensor<1x1xf32>)
func.return %5 : tensor<1x1xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : tensor<1xf32>
-// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<1x!quant.uniform>, volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform>
+// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<1x!quant.uniform>}> {volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform>
// CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x!quant.uniform>) -> tensor<1xf32>
// CHECK-NEXT: %[[cst_0:.*]] = arith.constant dense<{{.*}}> : tensor<1x3xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<1x3x!quant.uniform:f32:0, {{.*}}>>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32:0, {{.*}}>>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) <{qtype = tensor<1x3x!quant.uniform:f32:0, {{.*}}>>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32:0, {{.*}}>>
// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x3x!quant.uniform:f32:0, {{.*}}>>) -> tensor<1x3xf32>
-// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x3x!quant.uniform>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
+// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x3x!quant.uniform>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
// CHECK-NEXT: %[[dq3:.*]] = "tfl.dequantize"(%[[q3]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32>
-// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%[[dq3]], %[[dq2]], %[[dq1]]) {{{.*}}} : (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q4:.*]] = "tfl.quantize"(%[[fc]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%[[dq3]], %[[dq2]], %[[dq1]]) <{{{.*}}}> : (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<1x1xf32>
+// CHECK-NEXT: %[[q4:.*]] = "tfl.quantize"(%[[fc]]) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq4:.*]] = "tfl.dequantize"(%[[q4]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
// CHECK-NEXT: return %[[dq4]] : tensor<1x1xf32>
}
@@ -321,19 +321,19 @@ func.func @QuantizeReshapeAndFullyConnectedOp(%arg0: tensor<1x1x3xf32>) -> (tens
func.return %8 : tensor<1x1xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : tensor<1xf32>
-// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<1x!quant.uniform>, volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform>
+// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<1x!quant.uniform>}> {volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform>
// CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x!quant.uniform>) -> tensor<1xf32>
// CHECK-NEXT: %[[cst_0:.*]] = arith.constant dense<{{.*}}> : tensor<1x3xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<1x3x!quant.uniform:f32:0, {{.*}}>>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32:0, {{.*}}>>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) <{qtype = tensor<1x3x!quant.uniform:f32:0, {{.*}}>>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32:0, {{.*}}>>
// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x3x!quant.uniform:f32:0, {{.*}}>>) -> tensor<1x3xf32>
// CHECK-NEXT: %[[cst_1:.*]] = arith.constant dense<[-1, 3]> : tensor<2xi32>
-// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x3x!quant.uniform>, volatile} : (tensor<1x1x3xf32>) -> tensor<1x1x3x!quant.uniform>
+// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x3x!quant.uniform>}> {volatile} : (tensor<1x1x3xf32>) -> tensor<1x1x3x!quant.uniform>
// CHECK-NEXT: %[[dq3:.*]] = "tfl.dequantize"(%[[q3]]) : (tensor<1x1x3x!quant.uniform>) -> tensor<1x1x3xf32>
// CHECK-NEXT: %[[rs:.*]] = "tfl.reshape"(%[[dq3]], %[[cst_1]]) : (tensor<1x1x3xf32>, tensor<2xi32>) -> tensor<1x3xf32>
-// CHECK-NEXT: %[[q4:.*]] = "tfl.quantize"(%[[rs]]) {qtype = tensor<1x3x!quant.uniform>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
+// CHECK-NEXT: %[[q4:.*]] = "tfl.quantize"(%[[rs]]) <{qtype = tensor<1x3x!quant.uniform>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
// CHECK-NEXT: %[[dq4:.*]] = "tfl.dequantize"(%[[q4]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32>
-// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%[[dq4]], %[[dq2]], %[[dq1]]) {{{.*}}} : (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q5:.*]] = "tfl.quantize"(%[[fc]]) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%[[dq4]], %[[dq2]], %[[dq1]]) <{{{.*}}}> : (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<1x1xf32>
+// CHECK-NEXT: %[[q5:.*]] = "tfl.quantize"(%[[fc]]) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[dq5:.*]] = "tfl.dequantize"(%[[q5]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
// CHECK-NEXT: return %[[dq5]] : tensor<1x1xf32>
}
diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training.mlir
index 0b3ea28406825a..bf397c2d1b8463 100644
--- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training.mlir
@@ -29,9 +29,9 @@ func.func @QuantizeLstmCellInput(%arg0: tensor<1x28x28xf32>) -> tensor<1x28x20xf
none, none, none, none) -> tensor<1x28x20xf32>
%1 = "quantfork.stats"(%0) {layerStats = dense<[-1.0, 2.0]> : tensor<2xf32>} : (tensor<1x28x20xf32>) -> tensor<1x28x20xf32>
func.return %1 : tensor<1x28x20xf32>
-// CHECK-DAG: %[[none:.*]] = "tfl.no_value"() {value} : () -> none
+// CHECK-DAG: %[[none:.*]] = "tfl.no_value"() <{value}> : () -> none
// CHECK-DAG: %[[cell_input:.*]] = arith.constant dense<1.000000e+00> : tensor<1x20xf32>
-// CHECK-DAG: %[[q:.*]] = "tfl.quantize"(%[[cell_input]]) {qtype = tensor<1x20x!quant.uniform>} : (tensor<1x20xf32>) -> tensor<1x20x!quant.uniform>
+// CHECK-DAG: %[[q:.*]] = "tfl.quantize"(%[[cell_input]]) <{qtype = tensor<1x20x!quant.uniform>}> : (tensor<1x20xf32>) -> tensor<1x20x!quant.uniform>
// CHECK-DAG: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) : (tensor<1x20x!quant.uniform>) -> tensor<1x20xf32>
// Checks if input 19 is correctly passed from a dequantize op.
// CHECK: %[[lstm:.*]] = "tfl.unidirectional_sequence_lstm"(%arg0, {{(%[^%,]+, )+}}%[[dq]], %[[none]], %[[none]], %[[none]], %[[none]])
@@ -113,7 +113,7 @@ func.func @QuantizeWithoutNorm(%arg0: tensor<1x1x5xf32>) -> tensor<*xf32> attrib
// CHECK-SAME: %[[input_9]], %[[input_10]], %[[input_11]], %[[input_12]], %[[input_13]], %[[input_14]], %[[input_15]], %[[input_16]], %[[input_17]], %[[input_18]], %[[input_19]]
// CHECK-SAME: effective_hidden_scale_intermediate = tensor>
-// CHECK: "tfl.quantize"(%[[lstm]]) {qtype = tensor<*x!quant.uniform>, volatile}
+// CHECK: "tfl.quantize"(%[[lstm]]) <{qtype = tensor<*x!quant.uniform>}> {volatile}
}
// CHECK-LABEL: QuantizeLstmCifg
@@ -166,7 +166,7 @@ func.func @QuantizeLstmCifg(%arg0: tensor<1x5xf32>) -> tensor<*xf32> attributes
%24 = "quantfork.stats"(%23) {layerStats = dense<[-1.0, 2.0]> : tensor<2xf32>} : (tensor<*xf32>) -> tensor<*xf32>
func.return %24 : tensor<*xf32>
-// CHECK-DAG: %[[none:.*]] = "tfl.no_value"() {value} : () -> none
+// CHECK-DAG: %[[none:.*]] = "tfl.no_value"() <{value}> : () -> none
// CHECK-DAG: %[[input_0:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x5x!quant.uniform>) -> tensor<1x5xf32>
// CHECK-DAG: %[[input_2:.*]] = "tfl.dequantize"({{.*}}) : (tensor<2x5x!quant.uniform:f32, 0.018341723389512912>>) -> tensor<2x5xf32>
// CHECK-DAG: %[[input_3:.*]] = "tfl.dequantize"({{.*}}) : (tensor<2x5x!quant.uniform:f32, 0.011170119751156785>>) -> tensor<2x5xf32>
@@ -190,12 +190,12 @@ func.func @QuantizeLstmCifg(%arg0: tensor<1x5xf32>) -> tensor<*xf32> attributes
// CHECK: %[[lstm:.*]] = "tfl.lstm"(%[[input_0]], %[[none]], %[[input_2]], %[[input_3]], %[[input_4]], %[[none]], %[[input_6]], %[[input_7]], %[[input_8]],
// CHECK-SAME: %[[none]], %[[input_10]], %[[input_11]], %[[none]], %[[input_13]], %[[input_14]], %[[input_15]], %[[input_16]], %[[input_17]], %[[input_18]], %[[input_19]],
// CHECK-SAME: %[[none]], %[[input_21]], %[[input_22]], %[[input_23]])
-// CHECK-NEXT: effective_hidden_scale_intermediate = tensor>
+// CHECK-SAME: effective_hidden_scale_intermediate = tensor>
// CHECK-SAME: input_to_cell_intermediate = tensor:f32, 1.2207403790398877E-4>>
// CHECK-SAME: input_to_forget_intermediate = tensor:f32, 4.8829615161595508E-4>>
// CHECK-SAME: input_to_output_intermediate = tensor:f32, 3.0518509475997192E-5>>
-// CHECK: "tfl.quantize"(%[[lstm]]) {qtype = tensor<*x!quant.uniform>, volatile}
+// CHECK: "tfl.quantize"(%[[lstm]]) <{qtype = tensor<*x!quant.uniform>}> {volatile}
}
// CHECK-LABEL: QuantizeUnidirectionalLstmFull
@@ -286,7 +286,7 @@ func.func @QuantizeUnidirectionalLstmFull(%arg0: tensor<1x1x5xf32>) -> tensor<*x
// CHECK-SAME: input_to_input_intermediate = tensor:f32, 9.7659230323191015E-4>>
// CHECK-SAME: input_to_output_intermediate = tensor:f32, 3.0518509475997192E-5>>
-// CHECK: "tfl.quantize"(%[[lstm]]) {qtype = tensor<*x!quant.uniform>, volatile}
+// CHECK: "tfl.quantize"(%[[lstm]]) <{qtype = tensor<*x!quant.uniform>}> {volatile}
}
// CHECK-LABEL: QuantizeUnidirectionalLstmWithFixedOutputRangedInput
@@ -481,13 +481,13 @@ func.func @QuantizeLstmFull(%arg0: tensor<1x5xf32>) -> tensor<*xf32> attributes
// CHECK: %[[lstm:.*]] = "tfl.lstm"(%[[input_0]], %[[input_1]], %[[input_2]], %[[input_3]], %[[input_4]], %[[input_5]], %[[input_6]], %[[input_7]], %[[input_8]],
// CHECK-SAME: %[[input_9]], %[[input_10]], %[[input_11]], %[[input_12]], %[[input_13]], %[[input_14]], %[[input_15]], %[[input_16]], %[[input_17]], %[[input_18]], %[[input_19]],
// CHECK-SAME: %[[input_20]], %[[input_21]], %[[input_22]], %[[input_23]])
-// CHECK-NEXT: effective_hidden_scale_intermediate = tensor>
+// CHECK-SAME: effective_hidden_scale_intermediate = tensor>
// CHECK-SAME: input_to_cell_intermediate = tensor:f32, 1.2207403790398877E-4>>
// CHECK-SAME: input_to_forget_intermediate = tensor:f32, 4.8829615161595508E-4>>
// CHECK-SAME: input_to_input_intermediate = tensor:f32, 9.7659230323191015E-4>>
// CHECK-SAME: input_to_output_intermediate = tensor:f32, 3.0518509475997192E-5>>
-// CHECK: "tfl.quantize"(%[[lstm]]) {qtype = tensor<*x!quant.uniform>, volatile}
+// CHECK: "tfl.quantize"(%[[lstm]]) <{qtype = tensor<*x!quant.uniform>}> {volatile}
}
// CHECK-LABEL: QuantizeSVDF
@@ -508,7 +508,7 @@ func.func @QuantizeSVDF(%arg0: tensor<1x3xf32>) -> tensor<1x2xf32> {
// CHECK-DAG: %[[input_3:.*]] = "tfl.dequantize"({{.*}}) : (tensor<2x!quant.uniform>)
// CHECK-DAG: %[[input_4:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x4x!quant.uniform:f32, 0.0037514108011770368>>)
// CHECK: %[[svdf:.*]] = "tfl.svdf"(%[[input_0]], %[[input_1]], %[[input_2]], %[[input_3]], %[[input_4]])
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[svdf]]) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[svdf]]) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: return %[[dq]]
}
diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir
index 882b335135cf74..c6a2eb88e09e8f 100644
--- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir
@@ -7,7 +7,7 @@ func.func @uint8_to_int8(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
%2 = "tfl.dequantize"(%1) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32>
func.return %2 : tensor<2x2xf32>
-// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>)
+// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>)
// CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK-NEXT: return %[[dq]] : tensor<2x2xf32>
}
@@ -18,7 +18,7 @@ func.func @uint8_to_int8_per_axis(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
%2 = "tfl.dequantize"(%1) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32>
func.return %2 : tensor<2x2xf32>
-// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<2x2x!quant.uniform>}
+// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<2x2x!quant.uniform>}>
// CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%0)
// CHECK-NEXT: return %[[dq]] : tensor<2x2xf32>
}
@@ -29,7 +29,7 @@ func.func @uint8_to_int8_narrow_range(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32>
%2 = "tfl.dequantize"(%1) : (tensor<2x2x!quant.uniform:f32, 1.0:255>>) -> tensor<2x2xf32>
func.return %2 : tensor<2x2xf32>
-// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<2x2x!quant.uniform:f32, 1.000000e+00:127>>}
+// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<2x2x!quant.uniform:f32, 1.000000e+00:127>>}>
// CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK-NEXT: return %[[dq]] : tensor<2x2xf32>
}
@@ -49,9 +49,9 @@ func.func @prepareStatistics(%arg0: tensor<8x4x3xf32>) -> tensor<8x4x3xf32> {
} : (tensor<8x4x3xf32>) -> tensor<8x4x3xf32>
func.return %1 : tensor<8x4x3xf32>
-// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<8x4x3x!quant.uniform>, volatile}
+// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<8x4x3x!quant.uniform>}> {volatile}
// CHECK: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]])
-// CHECK: %[[q2:.*]] = "tfl.quantize"(%[[dq1]]) {qtype = tensor<8x4x3x!quant.uniform>, volatile}
+// CHECK: %[[q2:.*]] = "tfl.quantize"(%[[dq1]]) <{qtype = tensor<8x4x3x!quant.uniform>}> {volatile}
// CHECK: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]])
// CHECK: return %[[dq2]]
}
@@ -71,9 +71,9 @@ func.func @prepareStatisticsNudge(%arg0: tensor<8x4x3xf32>) -> tensor<8x4x3xf32>
} : (tensor<8x4x3xf32>) -> tensor<8x4x3xf32>
func.return %1 : tensor<8x4x3xf32>
-// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<8x4x3x!quant.uniform>, volatile}
+// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<8x4x3x!quant.uniform>}> {volatile}
// CHECK: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]])
-// CHECK: %[[q2:.*]] = "tfl.quantize"(%[[dq1]]) {qtype = tensor<8x4x3x!quant.uniform>, volatile}
+// CHECK: %[[q2:.*]] = "tfl.quantize"(%[[dq1]]) <{qtype = tensor<8x4x3x!quant.uniform>}> {volatile}
// CHECK: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]])
// CHECK: return %[[dq2]]
}
@@ -85,7 +85,7 @@ func.func @preparePrelu(%arg0: tensor<1x10x10x3xf32>) -> tensor<1x10x10x3xf32> {
func.return %prelu : tensor<1x10x10x3xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<[{{\[}}[1.66394591, 3.61694336, 2.0382936]]]> : tensor<1x1x3xf32>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<1x1x3x!quant.uniform:f32, 0.028479868971456691>>, volatile} : (tensor<1x1x3xf32>) -> tensor<1x1x3x!quant.uniform:f32, 0.028479868971456691>>
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<1x1x3x!quant.uniform:f32, 0.028479868971456691>>}> {volatile} : (tensor<1x1x3xf32>) -> tensor<1x1x3x!quant.uniform:f32, 0.028479868971456691>>
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) : (tensor<1x1x3x!quant.uniform:f32, 0.028479868971456691>>) -> tensor<1x1x3xf32>
// CHECK: %[[p:.*]] = "tfl.prelu"(%arg0, %[[dq]]) : (tensor<1x10x10x3xf32>, tensor<1x1x3xf32>) -> tensor<1x10x10x3xf32>
// CHECK: return %[[p]] : tensor<1x10x10x3xf32>
@@ -98,7 +98,7 @@ func.func @prepareAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
func.return %add : tensor<2x2xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<[{{\[}}0.000000e+00, 1.000000e+00], [2.000000e+00, 2.550000e+02]]>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<2x2x!quant.uniform>, volatile}
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<2x2x!quant.uniform>}> {volatile}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: %[[add:.*]] = tfl.add %arg0, %[[dq]]
// CHECK: return %[[add]]
@@ -113,13 +113,13 @@ func.func @prepareConv2DSplat(%arg0: tensor<1x5x5x3xf32>) -> tensor<1x5x5x3xf32>
func.return %conv : tensor<1x5x5x3xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<3x3x3x3xf32>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x3x3x3x!quant.uniform:f32:0
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<3x3x3x3x!quant.uniform:f32:0
// CHECK-SAME: {1.000000e+00,1.000000e+00,1.000000e+00}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]]
// PerTensor: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<3x3x3x3xf32>
-// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x3x3x3x!quant.uniform:f32, 1.000000e+00>>, volatile}
+// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<3x3x3x3x!quant.uniform:f32, 1.000000e+00>>}> {volatile}
// PerTensor: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]]
}
@@ -133,13 +133,13 @@ func.func @prepareConv2D(%arg0: tensor<1x5x5x1xf32>) -> tensor<1x5x5x3xf32> {
func.return %conv : tensor<1x5x5x3xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<[{{\[\[\[}}0.000000e+00]]], [{{\[\[}}1.270000e+02]]], [{{\[\[}}-1.270000e+02]]]]>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x1x1x1x!quant.uniform:f32:0,
-// CHECK-SAME: {3.9370078740157481E-9,1.000000e+00,1.000000e+00}>>, volatile}
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<3x1x1x1x!quant.uniform:f32:0,
+// CHECK-SAME: {3.9370078740157481E-9,1.000000e+00,1.000000e+00}>>}> {volatile}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]]
// PerTensor: %[[cst:.*]] = arith.constant dense<[{{\[\[\[}}0.000000e+00]]], [{{\[\[}}1.270000e+02]]], [{{\[\[}}-1.270000e+02]]]]>
-// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x1x1x1x!quant.uniform:f32,
+// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<3x1x1x1x!quant.uniform:f32,
// PerTensor: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]]
}
@@ -153,13 +153,13 @@ func.func @prepareDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11
func.return %dc : tensor<1x112x112x32xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<32x3x3x3xf32>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x3x3x3x!quant.uniform:f32:3
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<32x3x3x3x!quant.uniform:f32:3
// CHECK-SAME: {1.000000e+00,1.000000e+00,1.000000e+00}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: %[[conv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq]]
// PerTensor: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<32x3x3x3xf32>
-// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x3x3x3x!quant.uniform:f32,
+// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<32x3x3x3x!quant.uniform:f32,
// PerTensor: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// PerTensor: %[[conv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq]]
}
@@ -173,12 +173,12 @@ func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11
func.return %fc : tensor<1x112x112x4xf32>
// CHECK: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<4x12xf32>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<4x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00}>>, volatile}
+// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) <{qtype = tensor<4x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00}>>}> {volatile}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<4x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00}>>) -> tensor<4x12xf32>
// CHECK: "tfl.fully_connected"(%arg0, %[[dq]]
// PerTensor: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<4x12xf32>
-// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<4x12x!quant.uniform:f32, 1.000000e+00>>, volatile}
+// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) <{qtype = tensor<4x12x!quant.uniform:f32, 1.000000e+00>>}> {volatile}
// PerTensor: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<4x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<4x12xf32>
// PerTensor: "tfl.fully_connected"(%arg0, %[[dq]]
}
@@ -192,12 +192,12 @@ func.func @QuantizeTransposeConv(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<4x
func.return %tc : tensor<1x32x42x128xf32>
// CHECK: %[[CST:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32>
-// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CST]]) {qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>, volatile}
+// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CST]]) <{qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>}> {volatile}
// CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) : (tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>) -> tensor<1x32x42x128xf32>
// CHECK: "tfl.transpose_conv"(%arg1, %[[DEQUANTIZE]], %arg0,
// PerTensor: %[[CST:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32>
-// PerTensor: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CST]]) {qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>, volatile}
+// PerTensor: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CST]]) <{qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>}> {volatile}
// PerTensor: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) : (tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1x32x42x128xf32>
// PerTensor: "tfl.transpose_conv"(%arg1, %[[DEQUANTIZE]], %arg0,
}
diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir
index 3c83883dfea9a4..eea145a0b9f6ba 100644
--- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir
@@ -60,7 +60,7 @@ func.func @not_reset_input(%arg0: tensor) -> (tensor>} : (tensor) -> tensor>
func.return %0: tensor>
-// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor>}
+// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor>}>
// CHECK-NEXT: return %[[q]]
}
@@ -92,9 +92,9 @@ func.func @prepareStatistics(%arg0: tensor<8x4x3xf32>) -> tensor<8x4x3xf32> {
} : (tensor<8x4x3xf32>) -> tensor<8x4x3xf32>
func.return %1 : tensor<8x4x3xf32>
-// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<8x4x3x!quant.uniform>, volatile}
+// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<8x4x3x!quant.uniform>}> {volatile}
// CHECK: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]])
-// CHECK: %[[q2:.*]] = "tfl.quantize"(%[[dq1]]) {qtype = tensor<8x4x3x!quant.uniform>, volatile}
+// CHECK: %[[q2:.*]] = "tfl.quantize"(%[[dq1]]) <{qtype = tensor<8x4x3x!quant.uniform>}> {volatile}
// CHECK: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]])
// CHECK: return %[[dq2]]
}
@@ -106,7 +106,7 @@ func.func @prepareNarrowStatistics(%arg0: tensor<8x4x3xf32>) -> tensor<8x4x3xf32
} : (tensor<8x4x3xf32>) -> tensor<8x4x3xf32>
func.return %0 : tensor<8x4x3xf32>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<8x4x3x!quant.uniform>, volatile}
+// CHECK: %[[q:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<8x4x3x!quant.uniform>}> {volatile}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: return %[[dq]]
}
@@ -123,7 +123,7 @@ func.func @QuantizeConv2DPerChannel(%arg0: tensor<1x224x224x3x!quant.uniform
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<1.000000e+00> : tensor<32xf32>
-// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK-NEXT: %[[bias:.*]] = "tfl.dequantize"(%[[qbias]])
// CHECK-NEXT: %[[in:.*]] = "tfl.dequantize"(%arg0)
// CHECK-NEXT: %[[w:.*]] = "tfl.dequantize"(%arg1)
@@ -143,7 +143,7 @@ func.func @QuantizeConv2DPerChannelConst(%arg0: tensor<1x224x224x3x!quant.unifor
func.return %conv : tensor<1x112x112x32xf32>
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<1.000000e+00> : tensor<32xf32>
-// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK-NEXT: %[[bias:.*]] = "tfl.dequantize"(%[[qbias]])
// CHECK-NEXT: %[[in:.*]] = "tfl.dequantize"(%arg0)
// CHECK-NEXT: %[[w:.*]] = "tfl.dequantize"(%arg1)
@@ -163,7 +163,7 @@ func.func @QuantizeConv2DPerChannels(%arg0: tensor<1x224x224x3x!quant.uniform
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<1.000000e+00> : tensor<32xf32>
-// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK-NEXT: %[[bias:.*]] = "tfl.dequantize"(%[[qbias]])
// CHECK-NEXT: %[[in:.*]] = "tfl.dequantize"(%arg0)
// CHECK-NEXT: %[[w:.*]] = "tfl.dequantize"(%arg1)
@@ -183,7 +183,7 @@ func.func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform>
// CHECK: %cst = arith.constant dense<-1.23697901> : tensor<32xf32>
-// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%cst) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>)
// CHECK: %2 = "tfl.dequantize"(%arg0)
// CHECK: %3 = "tfl.pseudo_qconst"()
@@ -205,7 +205,7 @@ func.func @QuantizeFullyConnected(tensor<1x224x224x3x!quant.uniform>
// CHECK: %cst = arith.constant dense<-1.23697901> : tensor<32xf32>
-// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%cst) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>)
// CHECK: %2 = "tfl.dequantize"(%arg0)
// CHECK: %3 = "tfl.pseudo_qconst"()
@@ -227,7 +227,7 @@ func.func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform>
// CHECK: %cst = arith.constant dense<-1.23697901> : tensor<32xf32>
-// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%cst) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>)
// CHECK: %2 = "tfl.dequantize"(%arg0)
// CHECK: %3 = "tfl.pseudo_qconst"()
@@ -308,7 +308,7 @@ func.func @QuantizeStridedSlice(tensor<12x2x2x5x!quant.uniform>, te
// CHECK: %0 = "tfl.dequantize"(%arg0)
// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg3)
-// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x2x2x5x!quant.uniform>, volatile}
+// CHECK: %2 = "tfl.quantize"(%1) <{qtype = tensor<1x2x2x5x!quant.uniform>}> {volatile}
// CHECK: %3 = "tfl.dequantize"(%2)
// CHECK: return %3 : tensor<1x2x2x5xf32>
}
@@ -353,7 +353,7 @@ func.func @QuantizeReshape2D(tensor<1x6x6x16x!quant.uniform>)
// CHECK: %1 = "tfl.reshape"(%0, %{{.*}}) : (tensor<1x6x6x16xf32>, tensor<3xi32>) -> tensor<1x36x16xf32>
-// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x36x16x!quant.uniform>, volatile}
+// CHECK: %2 = "tfl.quantize"(%1) <{qtype = tensor<1x36x16x!quant.uniform>}> {volatile}
// CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x36x16x!quant.uniform>)
// CHECK: return %3 : tensor<1x36x16xf32>
}
@@ -366,8 +366,8 @@ func.func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform
// CHECK: %0 = "tfl.dequantize"(%arg0)
-// CHECK: %1 = "tfl.softmax"(%0) {beta = 1.000000e+00 : f32} : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32>
-// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile}
+// CHECK: %1 = "tfl.softmax"(%0) <{beta = 1.000000e+00 : f32}> : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32>
+// CHECK: %2 = "tfl.quantize"(%1) <{qtype = tensor<1x6x6x16x!quant.uniform>}> {volatile}
// CHECK: %3 = "tfl.dequantize"(%2)
// CHECK: return %3 : tensor<1x6x6x16xf32>
}
@@ -381,7 +381,7 @@ func.func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform) -> tensor<1x6x6x16xf32>
-// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile}
+// CHECK: %2 = "tfl.quantize"(%1) <{qtype = tensor<1x6x6x16x!quant.uniform>}> {volatile}
// CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32>
// CHECK: return %3 : tensor<1x6x6x16xf32>
}
@@ -416,7 +416,7 @@ func.func @QDQNoQuantizeSoftmax(tensor<1x6x6x16x!quant.uniform
// QDQ: %0 = "tfl.dequantize"(%arg0)
-// QDQ: %1 = "tfl.softmax"(%0) {beta = 1.000000e+00 : f32} : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32>
+// QDQ: %1 = "tfl.softmax"(%0) <{beta = 1.000000e+00 : f32}> : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32>
// QDQ-NOT: "tfl.quantize"
// QDQ: return %1 : tensor<1x6x6x16xf32>
}
@@ -429,7 +429,7 @@ func.func @QuantizeL2Norm(%arg0: tensor<1x6x6x16x!quant.uniform>) -
// CHECK: %[[in:.*]] = "tfl.dequantize"(%arg0)
// CHECK: %[[l2:.*]] = "tfl.l2_normalization"(%[[in]])
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[l2]]) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile}
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[l2]]) <{qtype = tensor<1x6x6x16x!quant.uniform>}> {volatile}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: return %[[dq]] : tensor<1x6x6x16xf32>
}
@@ -452,11 +452,11 @@ func.func @QuantizeConcatOperand0ToAll(tensor<1x2x!quant.uniform, tensor<1x2xf32>) -> tensor<2x2xf32>
func.return %1 : tensor<2x2xf32>
-// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
// CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %3 = "tfl.concatenation"(%2, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
-// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>, volatile}
+// CHECK: %3 = "tfl.concatenation"(%2, %1) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
+// CHECK: %4 = "tfl.quantize"(%3) <{qtype = tensor<2x2x!quant.uniform>}> {volatile}
// CHECK: %5 = "tfl.dequantize"(%4) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32>
// CHECK: return %5 : tensor<2x2xf32>
}
@@ -468,11 +468,11 @@ func.func @QuantizeConcatOperand1ToAll(tensor<1x2xf32>, tensor<1x2x!quant.unifor
%1 = "tfl.concatenation"(%arg0, %0) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
func.return %1 : tensor<2x2xf32>
-// CHECK: %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
// CHECK: %2 = "tfl.dequantize"(%arg1) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %3 = "tfl.concatenation"(%1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
-// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>, volatile}
+// CHECK: %3 = "tfl.concatenation"(%1, %2) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
+// CHECK: %4 = "tfl.quantize"(%3) <{qtype = tensor<2x2x!quant.uniform>}> {volatile}
// CHECK: %5 = "tfl.dequantize"(%4) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32>
// CHECK: return %5 : tensor<2x2xf32>
}
@@ -484,12 +484,12 @@ func.func @QuantizeConcatResToAll(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x
%1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
func.return %1 : tensor<2x2x!quant.uniform>
-// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %2 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %2 = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %4 = "tfl.concatenation"(%3, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
-// CHECK: %5 = "tfl.quantize"(%4) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
+// CHECK: %4 = "tfl.concatenation"(%3, %1) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
+// CHECK: %5 = "tfl.quantize"(%4) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
// CHECK: return %5 : tensor<2x2x!quant.uniform>
}
@@ -501,11 +501,11 @@ func.func @QuantizeConcatResToAllNoRequantize(tensor<1x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
func.return %2 : tensor<2x2x!quant.uniform>
-// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
// CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %3 = "tfl.concatenation"(%2, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
-// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
+// CHECK: %3 = "tfl.concatenation"(%2, %1) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
+// CHECK: %4 = "tfl.quantize"(%3) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
// CHECK: return %4 : tensor<2x2x!quant.uniform>
}
@@ -518,13 +518,13 @@ func.func @QuantizeConcatResToAllRequantize(tensor<1x2xf32>, tensor<1x2xf32>) ->
%3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
func.return %3 : tensor<2x2x!quant.uniform>
-// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %[[Q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
-// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%[[Q0]]) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
+// CHECK: %[[Q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
+// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%[[Q0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
// CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %[[CONC:.*]] = "tfl.concatenation"(%[[DQ0]], %[[DQ1]]) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
-// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
+// CHECK: %[[CONC:.*]] = "tfl.concatenation"(%[[DQ0]], %[[DQ1]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
+// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
// CHECK: return %[[Q]] : tensor<2x2x!quant.uniform>
}
@@ -536,12 +536,12 @@ func.func @QuantizeConcatResToAllRequantizeArg(tensor<1x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
func.return %3 : tensor<2x2x!quant.uniform>
-// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile}
+// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile}
// CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
+// CHECK: %[[RQ0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
// CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK: %[[CONC:.*]] = "tfl.concatenation"(%[[DQ0]], %[[DQ1]]) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
-// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
+// CHECK: %[[CONC:.*]] = "tfl.concatenation"(%[[DQ0]], %[[DQ1]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32>
+// CHECK: %[[Q:.*]] = "tfl.quantize"(%[[CONC]]) <{qtype = tensor<2x2x!quant.uniform>}> : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform>
// CHECK: return %[[Q]] : tensor<2x2x!quant.uniform>
}
@@ -551,8 +551,8 @@ func.func @NotRequantizeAlreadyQuantizedModel(%arg0: tensor<1x73x73x64x!quant.un
%10 = "tfl.concatenation"(%arg0, %9) {axis = 3 : i32, fused_activation_function = "NONE"} : (tensor<1x73x73x64x!quant.uniform>, tensor<1x73x73x96x!quant.uniform>) -> tensor<1x73x73x160x!quant.uniform>
func.return %10 : tensor<1x73x73x160x!quant.uniform>
-// CHECK: %[[max:.*]] = "tfl.max_pool_2d"(%arg1) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x96x!quant.uniform>) -> tensor<1x73x73x96x!quant.uniform>
-// CHECK: %[[cat:.*]] = "tfl.concatenation"(%arg0, %[[max]]) {axis = 3 : i32, fused_activation_function = "NONE"} : (tensor<1x73x73x64x!quant.uniform>, tensor<1x73x73x96x!quant.uniform>) -> tensor<1x73x73x160x!quant.uniform>
+// CHECK: %[[max:.*]] = "tfl.max_pool_2d"(%arg1) <{filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x147x147x96x!quant.uniform>) -> tensor<1x73x73x96x!quant.uniform>
+// CHECK: %[[cat:.*]] = "tfl.concatenation"(%arg0, %[[max]]) <{axis = 3 : i32, fused_activation_function = "NONE"}> : (tensor<1x73x73x64x!quant.uniform>, tensor<1x73x73x96x!quant.uniform>) -> tensor<1x73x73x160x!quant.uniform>
// CHECK: return %[[cat]] : tensor<1x73x73x160x!quant.uniform>
}
@@ -577,22 +577,22 @@ func.func @QuantizeChain(tensor<1x224x224x3x!quant.uniform
// CHECK: %cst = arith.constant dense<-1.23697901> : tensor<32xf32>
-// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%cst) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>)
// CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>)
// CHECK: %3 = "tfl.pseudo_qconst"()
// CHECK: %4 = "tfl.dequantize"(%3) : (tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>)
// CHECK: %5 = "tfl.average_pool_2d"(%2)
-// CHECK: %6 = "tfl.quantize"(%5) {qtype = tensor<1x224x224x3x!quant.uniform>, volatile}
+// CHECK: %6 = "tfl.quantize"(%5) <{qtype = tensor<1x224x224x3x!quant.uniform>}> {volatile}
// CHECK: %7 = "tfl.dequantize"(%6) : (tensor<1x224x224x3x!quant.uniform>)
// CHECK: %8 = "tfl.conv_2d"(%7, %4, %1)
-// CHECK: %9 = "tfl.quantize"(%8) {qtype = tensor<1x112x112x32x!quant.uniform>}
+// CHECK: %9 = "tfl.quantize"(%8) <{qtype = tensor<1x112x112x32x!quant.uniform>}>
// CHECK: %10 = "tfl.dequantize"(%9) : (tensor<1x112x112x32x!quant.uniform>)
// CHECK: %11 = "tfl.reshape"(%10, %{{.*}})
-// CHECK: %12 = "tfl.quantize"(%11) {qtype = tensor<1x36x16x!quant.uniform>, volatile}
+// CHECK: %12 = "tfl.quantize"(%11) <{qtype = tensor<1x36x16x!quant.uniform>}> {volatile}
// CHECK: %13 = "tfl.dequantize"(%12) : (tensor<1x36x16x!quant.uniform>)
// CHECK: %14 = "tfl.softmax"(%13)
-// CHECK: %15 = "tfl.quantize"(%14) {qtype = tensor<1x36x16x!quant.uniform>, volatile}
+// CHECK: %15 = "tfl.quantize"(%14) <{qtype = tensor<1x36x16x!quant.uniform>}> {volatile}
// CHECK: %16 = "tfl.dequantize"(%15) : (tensor<1x36x16x!quant.uniform>)
// CHECK: return %16 : tensor<1x36x16xf32>
}
@@ -603,7 +603,7 @@ func.func @QuantizeConstant() -> tensor<2x3xf32> {
func.return %cst : tensor<2x3xf32>
// CHECK: %cst = arith.constant dense{{.*}}tensor<2x3xf32>
-// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<2x3x!quant.uniform>, volatile}
+// CHECK: %0 = "tfl.quantize"(%cst) <{qtype = tensor<2x3x!quant.uniform>}> {volatile}
// CHECK: %1 = "tfl.dequantize"(%0)
// CHECK: return %1 : tensor<2x3xf32>
}
@@ -613,7 +613,7 @@ func.func @NotQuantizeNoneType() -> none {
%cst = "tfl.no_value"() {value = unit} : () -> none
func.return %cst : none
-// CHECK-NEXT: %[[cst:.*]] = "tfl.no_value"() {value} : () -> none
+// CHECK-NEXT: %[[cst:.*]] = "tfl.no_value"() <{value}> : () -> none
// CHECK-NEXT: return %[[cst]]
}
@@ -623,7 +623,7 @@ func.func @QuantizeZeroSplat() -> tensor<2x3xf32> {
func.return %cst : tensor<2x3xf32>
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0.000000e+00> : tensor<2x3xf32>
-// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile}
+// CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor<2x3x!quant.uniform>}> {volatile}
}
// CHECK-LABEL: QuantizeZeroScalar
@@ -632,7 +632,7 @@ func.func @QuantizeZeroScalar() -> tensor {
func.return %cst : tensor
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0.000000e+00> : tensor
-// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile}
+// CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor>}> {volatile}
}
// CHECK-LABEL: QuantizePositiveSplat
@@ -641,7 +641,7 @@ func.func @QuantizePositiveSplat() -> tensor<2x3xf32> {
func.return %cst : tensor<2x3xf32>
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<2.540000e+01> : tensor<2x3xf32>
-// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile}
+// CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor<2x3x!quant.uniform>}> {volatile}
}
// CHECK-LABEL: QuantizePositiveScalar
@@ -650,7 +650,7 @@ func.func @QuantizePositiveScalar() -> tensor {
func.return %cst : tensor
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<2.540000e+00> : tensor
-// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile}
+// CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor>}> {volatile}
}
// CHECK-LABEL: QuantizeNegativeSplat
@@ -659,7 +659,7 @@ func.func @QuantizeNegativeSplat() -> tensor<2x3xf32> {
func.return %cst : tensor<2x3xf32>
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<-2.540000e+00> : tensor<2x3xf32>
-// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile}
+// CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor<2x3x!quant.uniform>}> {volatile}
}
// CHECK-LABEL: QuantizeNegativeScalar
@@ -668,7 +668,7 @@ func.func @QuantizeNegativeScalar() -> tensor {
func.return %cst : tensor
// CHECK-NEXT: %[[cst:.*]] = arith.constant dense<-2.540000e+01> : tensor
-// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile}
+// CHECK-NEXT: "tfl.quantize"(%[[cst]]) <{qtype = tensor>}> {volatile}
}
// Make sure biases are not shared.
@@ -721,7 +721,7 @@ func.func @QuantizeSharedBiases2(
// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]])
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: %[[cst_0:.*]] = arith.constant dense<0.000000e+00> : tensor<32xf32>
-// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK: %[[dq_0:.*]] = "tfl.dequantize"(%[[q_0]])
// CHECK: %{{.*}} = tfl.add %{{.*}}, %[[dq_0]]
// CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]])
@@ -746,7 +746,7 @@ func.func @QuantizeSharedBiases3(
func.return %3, %7 : tensor<32x!quant.uniform>, tensor<1x56x56x32x!quant.uniform>
// CHECK: %[[cst:.*]] = arith.constant dense<0.000000e+00> : tensor<32xf32>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile}
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) <{qtype = tensor<32x!quant.uniform>}> {volatile}
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK: %[[cst_0:.*]] = arith.constant dense<0.000000e+00> : tensor<32xf32>
// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]])
@@ -796,10 +796,10 @@ func.func @QuantizeWeight(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x32
func.return %c : tensor<1x112x112x32xf32>
// CHECK: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<32x3x3x3xf32>
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.003937007874015748:1>>, volatile} : (tensor<32x3x3x3xf32>) -> tensor<32x3x3x3x!quant.uniform:f32, 0.003937007874015748:1>>
+// CHECK: %[[q:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.003937007874015748:1>>}> {volatile} : (tensor<32x3x3x3xf32>) -> tensor<32x3x3x3x!quant.uniform:f32, 0.003937007874015748:1>>
// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) : (tensor<32x3x3x3x!quant.uniform:f32, 0.003937007874015748:1>>) -> tensor<32x3x3x3xf32>
// CHECK: %[[b:.*]] = arith.constant dense<-1.000000e+00> : tensor<32xf32>
-// CHECK: %[[c:.*]] = "tfl.conv_2d"(%arg0, %[[dq]], %[[b]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32>
+// CHECK: %[[c:.*]] = "tfl.conv_2d"(%arg0, %[[dq]], %[[b]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32>
// CHECK: return %[[c]] : tensor<1x112x112x32xf32>
}
@@ -812,7 +812,7 @@ func.func @NoRedundantQuantizeWeight() -> tensor<1x112x112x32xf32> {
func.return %dq : tensor<1x112x112x32xf32>
// CHECK-NEXT: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<1x112x112x32xf32>
-// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<1x112x112x32x!quant.uniform>}
+// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<1x112x112x32x!quant.uniform>}>
// CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[q]])
// CHECK-NEXT: return %[[dq]] : tensor<1x112x112x32xf32>
}
@@ -849,34 +849,34 @@ func.func @QuantizedCatsAddRequantsTest(%arg0: tensor<1x1xf32>, %arg1: tensor<1x
%13 = "tfl.concatenation"(%9, %3) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x1xf32>) -> tensor<1x3xf32>
%14 = "quantfork.stats"(%13) {layerStats = dense<[-0.488159984, 0.398609281]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32>
func.return %10, %14 : tensor<1x4xf32>, tensor<1x3xf32>
-// CHECK-NEXT: %[[q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
-// CHECK-NEXT: %[[r0q0:.*]] = "tfl.quantize"(%[[q0]]) {qtype = tensor<1x1x!quant.uniform>} : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
-// CHECK-NEXT: %[[r1q0:.*]] = "tfl.quantize"(%[[q0]]) {qtype = tensor<1x1x!quant.uniform>} : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[r0q0:.*]] = "tfl.quantize"(%[[q0]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[r1q0:.*]] = "tfl.quantize"(%[[q0]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[d1q0:.*]] = "tfl.dequantize"(%[[r1q0]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
// CHECK-NEXT: %[[d0q0:.*]] = "tfl.dequantize"(%[[r0q0]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
-// CHECK-NEXT: %[[r0q1:.*]] = "tfl.quantize"(%[[q1]]) {qtype = tensor<1x1x!quant.uniform>} : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[r0q1:.*]] = "tfl.quantize"(%[[q1]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[d0q1:.*]] = "tfl.dequantize"(%[[r0q1]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%arg2) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
-// CHECK-NEXT: %[[r0q2:.*]] = "tfl.quantize"(%[[q2]]) {qtype = tensor<1x1x!quant.uniform>} : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%arg2) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[r0q2:.*]] = "tfl.quantize"(%[[q2]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[d0q2:.*]] = "tfl.dequantize"(%[[r0q2]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg3) {qtype = tensor<1x1x!quant.uniform>, volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
-// CHECK-NEXT: %[[r0q3:.*]] = "tfl.quantize"(%[[q3]]) {qtype = tensor<1x1x!quant.uniform>} : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg3) <{qtype = tensor<1x1x!quant.uniform>}> {volatile} : (tensor<1x1xf32>) -> tensor<1x1x!quant.uniform>
+// CHECK-NEXT: %[[r0q3:.*]] = "tfl.quantize"(%[[q3]]) <{qtype = tensor<1x1x!quant.uniform>}> : (tensor<1x1x!quant.uniform>) -> tensor<1x1x!quant.uniform>
// CHECK-NEXT: %[[d0q3:.*]] = "tfl.dequantize"(%[[r0q3]]) : (tensor<1x1x!quant.uniform>) -> tensor<1x1xf32>
-// CHECK-NEXT: %[[cat1_0:.*]] = "tfl.concatenation"(%[[d0q1]], %[[d1q0]]) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32>
-// CHECK-NEXT: %[[qcat1_0:.*]] = "tfl.quantize"(%[[cat1_0]]) {qtype = tensor<1x2x!quant.uniform>, volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
-// CHECK-NEXT: %[[r0qcat1_0:.*]] = "tfl.quantize"(%[[qcat1_0]]) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
+// CHECK-NEXT: %[[cat1_0:.*]] = "tfl.concatenation"(%[[d0q1]], %[[d1q0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32>
+// CHECK-NEXT: %[[qcat1_0:.*]] = "tfl.quantize"(%[[cat1_0]]) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
+// CHECK-NEXT: %[[r0qcat1_0:.*]] = "tfl.quantize"(%[[qcat1_0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
// CHECK-NEXT: %[[d0qcat1_0:.*]] = "tfl.dequantize"(%[[r0qcat1_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK-NEXT: %[[cat_2_0:.*]] = "tfl.concatenation"(%[[d0q2]], %[[d0q0]]) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32>
-// CHECK-NEXT: %[[qcat_2_0:.*]] = "tfl.quantize"(%[[cat_2_0]]) {qtype = tensor<1x2x!quant.uniform>, volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
-// CHECK-NEXT: %[[r0qcat_2_0:.*]] = "tfl.quantize"(%[[qcat_2_0]]) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
+// CHECK-NEXT: %[[cat_2_0:.*]] = "tfl.concatenation"(%[[d0q2]], %[[d0q0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x2xf32>
+// CHECK-NEXT: %[[qcat_2_0:.*]] = "tfl.quantize"(%[[cat_2_0]]) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
+// CHECK-NEXT: %[[r0qcat_2_0:.*]] = "tfl.quantize"(%[[qcat_2_0]]) <{qtype = tensor<1x2x!quant.uniform>}> : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
// CHECK-NEXT: %[[d0qcat_2_0:.*]] = "tfl.dequantize"(%[[r0qcat_2_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
// CHECK-NEXT: %[[dqcat_2_0:.*]] = "tfl.dequantize"(%[[qcat_2_0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
-// CHECK-NEXT: %[[cat_2_0_1_0:.*]] = "tfl.concatenation"(%[[dqcat_2_0]], %[[d0qcat1_0]]) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x4xf32>
-// CHECK-NEXT: %[[qcat_2_0_1_0:.*]] = "tfl.quantize"(%[[cat_2_0_1_0]]) {qtype = tensor<1x4x!quant.uniform>, volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform>
+// CHECK-NEXT: %[[cat_2_0_1_0:.*]] = "tfl.concatenation"(%[[dqcat_2_0]], %[[d0qcat1_0]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x4xf32>
+// CHECK-NEXT: %[[qcat_2_0_1_0:.*]] = "tfl.quantize"(%[[cat_2_0_1_0]]) <{qtype = tensor<1x4x!quant.uniform>}> {volatile} : (tensor<1x4xf32>) -> tensor<1x4x!quant.uniform>
// CHECK-NEXT: %[[dqcat_2_0_1_0:.*]] = "tfl.dequantize"(%[[qcat_2_0_1_0]]) : (tensor<1x4x!quant.uniform>) -> tensor<1x4xf32>
-// CHECK-NEXT: %[[cat_2_0_3:.*]] = "tfl.concatenation"(%[[d0qcat_2_0]], %[[d0q3]]) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x1xf32>) -> tensor<1x3xf32>
-// CHECK-NEXT: %[[qcat_2_0_3:.*]] = "tfl.quantize"(%[[cat_2_0_3]]) {qtype = tensor<1x3x!quant.uniform>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
+// CHECK-NEXT: %[[cat_2_0_3:.*]] = "tfl.concatenation"(%[[d0qcat_2_0]], %[[d0q3]]) <{axis = -1 : i32, fused_activation_function = "NONE"}> : (tensor<1x2xf32>, tensor<1x1xf32>) -> tensor<1x3xf32>
+// CHECK-NEXT: %[[qcat_2_0_3:.*]] = "tfl.quantize"(%[[cat_2_0_3]]) <{qtype = tensor<1x3x!quant.uniform>}> {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform>
// CHECK-NEXT: %[[dqcat_2_0_3:.*]] = "tfl.dequantize"(%[[qcat_2_0_3]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32>
// CHECK-NEXT: return %[[dqcat_2_0_1_0]], %[[dqcat_2_0_3]] : tensor<1x4xf32>, tensor<1x3xf32>
}
@@ -892,10 +892,10 @@ func.func @TransposePerTensorQuantizationPropagation() -> tensor<2x5xf32> {
// QDQ: %[[perm:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32>
// QDQ-NEXT: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<5x2xf32>
- // QDQ-NEXT: %[[qw:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<5x2x!quant.uniform:f32
+ // QDQ-NEXT: %[[qw:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<5x2x!quant.uniform:f32
// QDQ-NEXT: %[[dqw:.*]] = "tfl.dequantize"(%[[qw]]) : (tensor<5x2x!quant.uniform:f32
// QDQ-NEXT: %[[tp:.*]] = "tfl.transpose"(%[[dqw]], %[[perm]]) : (tensor<5x2xf32>, tensor<2xi32>) -> tensor<2x5xf32>
- // QDQ-NEXT: %[[qtw:.*]] = "tfl.quantize"(%[[tp]]) {qtype = tensor<2x5x!quant.uniform:f32
+ // QDQ-NEXT: %[[qtw:.*]] = "tfl.quantize"(%[[tp]]) <{qtype = tensor<2x5x!quant.uniform:f32
// QDQ-NEXT: %[[dqtw:.*]] = "tfl.dequantize"(%[[qtw]]) : (tensor<2x5x!quant.uniform:f32
// QDQ-NEXT: return %[[dqtw]] : tensor<2x5xf32>
}
@@ -911,10 +911,10 @@ func.func @TransposePerChannelNewQuantDim() -> tensor<2x5xf32> {
// QDQ: %[[perm:.*]] = arith.constant dense<[1, 0]> : tensor<2xi32>
// QDQ-NEXT: %[[w:.*]] = arith.constant dense<1.000000e+00> : tensor<5x2xf32>
-// QDQ-NEXT: %[[qw:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<5x2x!quant.uniform:f32:0
+// QDQ-NEXT: %[[qw:.*]] = "tfl.quantize"(%[[w]]) <{qtype = tensor<5x2x!quant.uniform:f32:0
// QDQ-NEXT: %[[dqw:.*]] = "tfl.dequantize"(%[[qw]]) : (tensor<5x2x!quant.uniform:f32:0
// QDQ-NEXT: %[[tp:.*]] = "tfl.transpose"(%[[dqw]], %[[perm]]) : (tensor<5x2xf32>, tensor<2xi32>) -> tensor<2x5xf32>
-// QDQ-NEXT: %[[qtw:.*]] = "tfl.quantize"(%[[tp]]) {qtype = tensor<2x5x!quant.uniform:f32:1
+// QDQ-NEXT: %[[qtw:.*]] = "tfl.quantize"(%[[tp]]) <{qtype = tensor<2x5x!quant.uniform:f32:1
// QDQ-NEXT: %[[dqtw:.*]] = "tfl.dequantize"(%[[qtw]]) : (tensor<2x5x!quant.uniform:f32:1
// QDQ-NEXT: return %[[dqtw]] : tensor<2x5xf32>
}
diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir
index dca4c21766ee4a..d9e15db9a182a5 100644
--- a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant-4bit.mlir
@@ -12,7 +12,7 @@ func.func @fakeQuantPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tensor<8
// CHECK: %[[fq:.*]] = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %cst, %cst_0)
// The last channel tests the code in quantization utils that expands very small ranges to be at least 1e-6.
-// CHECK: %[[q:.*]] = "tfl.quantize"(%[[fq]]) {qtype = tensor<8x4x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[fq]]) <{qtype = tensor<8x4x!quant.uniform>}> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: return %[[dq]] } @@ -26,7 +26,7 @@ func.func @fakeQuantForActivation(tensor<8xf32>) -> (tensor<8xf32>) { func.return %0 : tensor<8xf32> // CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) -// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} +// CHECK: %1 = "tfl.quantize"(%0) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %2 = "tfl.dequantize"(%1) // CHECK: return %2 } @@ -41,7 +41,7 @@ func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quan func.return %1 : tensor<8x!quant.uniform> // CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) <{narrow_range = false, num_bits = 3 : i64}> -// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} +// CHECK: %1 = "tfl.quantize"(%0) <{qtype = tensor<8x!quant.uniform>}> // CHECK: return %1 } @@ -60,7 +60,7 @@ func.func @WrappedFakeQuantFolded() -> tensor<8xf32> { func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -76,7 +76,7 @@ func.func @fakeQuantFolded() -> (tensor<8xf32>) { func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -90,7 +90,7 @@ func.func @fakeQuantFoldedWithoutIdentity() -> (tensor<8xf32>) { func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -108,7 +108,7 @@ func.func @fakeQuantFoldedWithCast() -> (tensor<8xf32>) { func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -198,7 +198,7 @@ func.func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x8x7x16xf3 // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<16x3x3x3xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<16x3x3x3x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) <{qtype = tensor<16x3x3x3x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = 
"tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) // CHECK: return %[[CONV]] @@ -218,7 +218,7 @@ func.func @perChannelFakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256 // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<16x3x3x3xf32> // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<16x3x3x3x!quant.uniform> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) @@ -239,7 +239,7 @@ func.func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<48xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<1x3x3x48xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) <{qtype = tensor<1x3x3x48x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) // CHECK: return %[[CONV]] @@ -259,7 +259,7 @@ func.func @perChannelFakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (t // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<1x3x3x48xf32> // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<48xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} @@ -285,7 +285,7 @@ func.func @perChannelFakeQuantWithDepthwiseConv2DWithReshape(%arg: tensor<1x160x // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<1x3x3x48xf32> // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<48xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} @@ -302,7 +302,7 @@ func.func @fakeQuant3BitPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tens func.return %0 : tensor<8x4xf32> // LOBIT: %[[fq:.*]] = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %cst, %cst_0) -// LOBIT: %[[q:.*]] = "tfl.quantize"(%[[fq]]) {qtype = tensor<8x4x!quant.uniform:f32:1, {1.000000e+00,1.000000e+00:1,2.000000e+00:4,2.000000e+00:3}>>} +// LOBIT: %[[q:.*]] = "tfl.quantize"(%[[fq]]) <{qtype = tensor<8x4x!quant.uniform:f32:1, {1.000000e+00,1.000000e+00:1,2.000000e+00:4,2.000000e+00:3}>>}> // LOBIT: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // LOBIT: return %[[dq]] } @@ -316,7 +316,7 @@ func.func @fakeQuant3BitForActivation(tensor<8xf32>) -> (tensor<8xf32>) { func.return %0 : tensor<8xf32> // LOBIT: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) -// LOBIT: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform:f32, 2.000000e+00:3>>} +// LOBIT: %1 = "tfl.quantize"(%0) <{qtype = tensor<8x!quant.uniform:f32, 2.000000e+00:3>>}> // LOBIT: %2 = "tfl.dequantize"(%1) // LOBIT: return %2 } @@ -335,7 +335,7 @@ func.func @fakeQuant4BitWithConv2DPerChannel(tensor<256x32x32x3xf32>) -> (tensor // LOBIT-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<4xf32> // LOBIT-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<4x3x3x3xf32> -// LOBIT: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<4x3x3x3x!quant.uniform:f32:0, 
{1.000000e+00:1,1.000000e+00:2,1.000000e+00:7,1.000000e+00:15}>>} +// LOBIT: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) <{qtype = tensor<4x3x3x3x!quant.uniform:f32:0, {1.000000e+00:1,1.000000e+00:2,1.000000e+00:7,1.000000e+00:15}>>}> // LOBIT: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // LOBIT: %[[CONV:.*]] = "tfl.conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) // LOBIT: return %[[CONV]] diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir index c65cecc188f468..b9fe9310588d77 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf-fake-quant.mlir @@ -11,7 +11,7 @@ func.func @fakeQuantPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tensor<8 func.return %0 : tensor<8x4xf32> // CHECK: %[[fq:.*]] = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %cst, %cst_0) -// CHECK: %[[q:.*]] = "tfl.quantize"(%[[fq]]) {qtype = tensor<8x4x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[fq]]) <{qtype = tensor<8x4x!quant.uniform>}> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: return %[[dq]] } @@ -25,7 +25,7 @@ func.func @fakeQuantForActivation(tensor<8xf32>) -> (tensor<8xf32>) { func.return %0 : tensor<8xf32> // CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) -// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} +// CHECK: %1 = "tfl.quantize"(%0) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %2 = "tfl.dequantize"(%1) // CHECK: return %2 } @@ -40,7 +40,7 @@ func.func @fakeQuantForActivationNoDuplication(tensor<8xf32>) -> (tensor<8x!quan func.return %1 : tensor<8x!quant.uniform> // CHECK: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) <{narrow_range = false, num_bits = 5 : i64}> -// CHECK: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform>} +// CHECK: %1 = "tfl.quantize"(%0) <{qtype = tensor<8x!quant.uniform>}> // CHECK: return %1 } @@ -59,7 +59,7 @@ func.func @WrappedFakeQuantFolded() -> tensor<8xf32> { func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -75,7 +75,7 @@ func.func @fakeQuantFolded() -> (tensor<8xf32>) { func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -89,7 +89,7 @@ func.func @fakeQuantFoldedWithoutIdentity() -> (tensor<8xf32>) { func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -107,7 +107,7 @@ func.func @fakeQuantFoldedWithCast() -> (tensor<8xf32>) { 
func.return %rst : tensor<8xf32> // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) {qtype = tensor<8x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT]]) <{qtype = tensor<8x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: return %[[DEQUANTIZE]] : tensor<8xf32> } @@ -197,7 +197,7 @@ func.func @fakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x8x7x16xf3 // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<16x3x3x3xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<16x3x3x3x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) <{qtype = tensor<16x3x3x3x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) // CHECK: return %[[CONV]] @@ -217,7 +217,7 @@ func.func @perChannelFakeQuantWithConv2D(tensor<256x32x32x3xf32>) -> (tensor<256 // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<16x3x3x3xf32> // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<16x3x3x3x!quant.uniform> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) @@ -238,7 +238,7 @@ func.func @fakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor<256x // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<48xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<1x3x3x48xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) <{qtype = tensor<1x3x3x48x!quant.uniform>}> // CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) // CHECK: %[[CONV:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) // CHECK: return %[[CONV]] @@ -258,7 +258,7 @@ func.func @perChannelFakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (t // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<1x3x3x48xf32> // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<48xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} @@ -284,7 +284,7 @@ func.func @perChannelFakeQuantWithDepthwiseConv2DWithReshape(%arg: tensor<1x160x // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<0.000000e+00> : tensor<1x3x3x48xf32> // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<48xf32> -// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} @@ -301,7 +301,7 @@ func.func @fakeQuant3BitPerChannelForActivation(%arg0: tensor<8x4xf32>) -> (tens func.return %0 : tensor<8x4xf32> // LOBIT: %[[fq:.*]] = "tf.FakeQuantWithMinMaxVarsPerChannel"(%arg0, %cst, %cst_0) -// LOBIT: %[[q:.*]] = "tfl.quantize"(%[[fq]]) {qtype = tensor<8x4x!quant.uniform:f32:1, {1.000000e+00,1.000000e+00:1,2.000000e+00:16,2.000000e+00:15}>>} +// LOBIT: %[[q:.*]] = "tfl.quantize"(%[[fq]]) <{qtype = tensor<8x4x!quant.uniform:f32:1, {1.000000e+00,1.000000e+00:1,2.000000e+00:16,2.000000e+00:15}>>}> // 
LOBIT: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // LOBIT: return %[[dq]] } @@ -315,7 +315,7 @@ func.func @fakeQuant3BitForActivation(tensor<8xf32>) -> (tensor<8xf32>) { func.return %0 : tensor<8xf32> // LOBIT: %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %cst, %cst_0) -// LOBIT: %1 = "tfl.quantize"(%0) {qtype = tensor<8x!quant.uniform:f32, 2.000000e+00:15>>} +// LOBIT: %1 = "tfl.quantize"(%0) <{qtype = tensor<8x!quant.uniform:f32, 2.000000e+00:15>>}> // LOBIT: %2 = "tfl.dequantize"(%1) // LOBIT: return %2 } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 1b323ff3df689b..785cfa2fa2d26f 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -23,13 +23,13 @@ func.func @conv(tensor<256x32x32x3xf32>, tensor<3x3x3x16xf32>, tensor<256x3x32x3 // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<[3, 0, 1, 2]> : tensor<4xi32> // CHECK-DAG: %[[CONSTANT1:.*]] = arith.constant dense<[{{\[}}0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi32> // CHECK: %0 = "tf.Transpose"(%arg1, %[[CONSTANT0]]) : (tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor<16x3x3x3xf32> -// CHECK: %1 = "tfl.conv_2d"(%arg0, %0, %[[CONSTANT]]) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x8x7x16xf32> +// CHECK: %1 = "tfl.conv_2d"(%arg0, %0, %[[CONSTANT]]) <{dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x8x7x16xf32> // CHECK: %2 = "tf.Conv2D" // CHECK: %3 = "tf.Transpose"(%arg1, %[[CONSTANT0]]) : (tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor<16x3x3x3xf32> -// CHECK: %4 = "tfl.conv_2d"(%arg0, %3, %[[CONSTANT]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x8x6x16xf32> +// CHECK: %4 = "tfl.conv_2d"(%arg0, %3, %[[CONSTANT]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x8x6x16xf32> // CHECK: %5 = "tf.Pad"(%arg0, %[[CONSTANT1]]) : (tensor<256x32x32x3xf32>, tensor<4x2xi32>) -> tensor<*xf32> // CHECK: %6 = "tf.Transpose"(%arg1, %[[CONSTANT0]]) : (tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor<16x3x3x3xf32> -// CHECK: %7 = "tfl.conv_2d"(%5, %6, %[[CONSTANT]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<*xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> +// CHECK: %7 = "tfl.conv_2d"(%5, %6, %[[CONSTANT]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<*xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> // CHECK: %8 = "tf.Conv2D"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [2, 1, 1, 1]}> {T = "tfdtype$DT_FLOAT"} : (tensor<256x32x32x3xf32>, 
tensor<3x3x3x16xf32>) -> tensor<256x32x32x16xf32> } @@ -50,10 +50,10 @@ func.func @depthwiseConv2D(tensor<256x32x32x3xf32>, tensor<3x3x3x4xf32>, tensor< // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<12xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<[1, 3, 3, 12]> : tensor<4xi32> // CHECK: %0 = "tf.Reshape"(%arg1, %[[CONSTANT0]]) : (tensor<3x3x3x4xf32>, tensor<4xi32>) -> tensor<1x3x3x12xf32> -// CHECK: %1 = "tfl.depthwise_conv_2d"(%arg0, %0, %[[CONSTANT]]) {depth_multiplier = 4 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, tensor<1x3x3x12xf32>, tensor<12xf32>) -> tensor<256x30x30x12xf32> +// CHECK: %1 = "tfl.depthwise_conv_2d"(%arg0, %0, %[[CONSTANT]]) <{depth_multiplier = 4 : i32, dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<256x32x32x3xf32>, tensor<1x3x3x12xf32>, tensor<12xf32>) -> tensor<256x30x30x12xf32> // CHECK: %2 = "tf.DepthwiseConv2dNative" // CHECK: %3 = "tf.Reshape"(%arg1, %[[CONSTANT0]]) : (tensor<3x3x3x4xf32>, tensor<4xi32>) -> tensor<1x3x3x12xf32> -// CHECK: %4 = "tfl.depthwise_conv_2d"(%arg0, %3, %[[CONSTANT]]) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, tensor<1x3x3x12xf32>, tensor<12xf32>) -> tensor<256x30x30x12xf32> +// CHECK: %4 = "tfl.depthwise_conv_2d"(%arg0, %3, %[[CONSTANT]]) <{depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> : (tensor<256x32x32x3xf32>, tensor<1x3x3x12xf32>, tensor<12xf32>) -> tensor<256x30x30x12xf32> // CHECK: %5 = "tf.DepthwiseConv2dNative" } @@ -145,7 +145,7 @@ func.func @QDQsFollowedByTranspose(tensor<1x2xf32>) -> (tensor<2x1xf32>) { // CHECK: %cst = arith.constant // CHECK: %[[trans:.*]] = "tf.Transpose" // CHECK-SAME: -> tensor<2x1xf32> -// CHECK: %[[q:.*]] = "tfl.quantize"(%[[trans]]) {qtype = tensor<2x1x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[trans]]) <{qtype = tensor<2x1x!quant.uniform>}> // CHECK-SAME: -> tensor<2x1x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK-SAME: -> tensor<2x1xf32> @@ -164,7 +164,7 @@ func.func @QDQFollowedByReshape(tensor<1x2xf32>) -> (tensor<2x1xf32>) { // CHECK: %cst = arith.constant // CHECK: %[[rs:.*]] = "tf.Reshape" // CHECK-SAME: -> tensor<2x1xf32> -// CHECK: %[[q:.*]] = "tfl.quantize"(%[[rs]]) {qtype = tensor<2x1x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[rs]]) <{qtype = tensor<2x1x!quant.uniform>}> // CHECK-SAME: -> tensor<2x1x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK-SAME: -> tensor<2x1xf32> @@ -503,7 +503,7 @@ func.func @xla_conv_v2(%arg0: tensor<4x8x8x16xf32>) -> tensor<4x8x8x16xf32> { func.return %4 : tensor<4x8x8x16xf32> // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> // CHECK-DAG: %[[CST0:.*]] = arith.constant dense<1.000000e+00> : tensor<16x3x3x16xf32> - // CHECK: %[[RES:.*]] = "tfl.conv_2d"(%arg0, %[[CST0]], %[[CST]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<4x8x8x16xf32>, 
tensor<16x3x3x16xf32>, tensor<16xf32>) -> tensor<4x8x8x16xf32> + // CHECK: %[[RES:.*]] = "tfl.conv_2d"(%arg0, %[[CST0]], %[[CST]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<4x8x8x16xf32>, tensor<16x3x3x16xf32>, tensor<16xf32>) -> tensor<4x8x8x16xf32> // CHECK: return %[[RES]] } @@ -661,7 +661,7 @@ func.func @QuantDequantTranspose(%arg0: tensor<2x3xf32>) -> (tensor<2x4xf32>) { // CHECK-LABEL: QuantDequantTranspose // CHECK-DAG: %[[CST:.*]] = "tf.Const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1.00392163> : tensor<3x4xf32> - // CHECK: %[[QUANT:.*]] = "tfl.quantize"(%[[CST_0]]) {qtype = tensor<3x4x!quant.uniform>} : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform> + // CHECK: %[[QUANT:.*]] = "tfl.quantize"(%[[CST_0]]) <{qtype = tensor<3x4x!quant.uniform>}> : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform> // CHECK: %[[DEQUANT:.*]] = "tfl.dequantize"(%[[QUANT]]) : (tensor<3x4x!quant.uniform>) -> tensor<3x4xf32> // CHECK: %[[TRANSPOSE:.*]] = "tf.Transpose"(%[[DEQUANT]], %[[CST]]) : (tensor<3x4xf32>, tensor) -> tensor<*xf32> // CHECK: %[[MATMUL:.*]] = "tf.MatMul"(%arg0, %[[TRANSPOSE]]) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = true}> : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<2x4xf32> @@ -675,7 +675,7 @@ func.func @GroupConv(%arg0: tensor, %arg1: tensor<1x3x2x14xf32>) // CHECK-DAG: %[[CONSTANT:.*]] = arith.constant dense<0.000000e+00> : tensor<14xf32> // CHECK-DAG: %[[CONSTANT0:.*]] = arith.constant dense<[3, 0, 1, 2]> : tensor<4xi32> // CHECK: %0 = "tf.Transpose"(%arg1, %[[CONSTANT0]]) : (tensor<1x3x2x14xf32>, tensor<4xi32>) -> tensor<14x1x3x2xf32> - // CHECK: %1 = "tfl.conv_2d"(%arg0, %0, %[[CONSTANT]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 5 : i32} : (tensor, tensor<14x1x3x2xf32>, tensor<14xf32>) -> tensor + // CHECK: %1 = "tfl.conv_2d"(%arg0, %0, %[[CONSTANT]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 5 : i32}> : (tensor, tensor<14x1x3x2xf32>, tensor<14xf32>) -> tensor } func.func @UnsupportedGroupConv_UnrankedTensorType(%arg0: tensor<*xf32>, %arg1: tensor<1x3x2x14xf32>) -> (tensor) { @@ -704,4 +704,80 @@ func.func @RedundantShapeOp(%shape: tensor, %fill: tensor) -> (tenso // CHECK-LABEL: RedundantShapeOp // CHECK-NOT: "tf.Shape" } + +// CHECK-LABEL: @MoveTransposeAcrossPerChannelQuant +func.func @MoveTransposeAcrossPerChannelQuant(%arg0 : tensor<1x224x224x3xf32>) -> tensor<1x112x112x6xf32> { + %cst = "tf.Const"() <{value = dense<6.0> : tensor<6x3x7x7xf32>}> : () -> tensor<6x3x7x7xf32> + %cst_14 = "tf.Const"() <{value = dense<[2, 3, 1, 0]> : tensor<4xi64>}> : () -> tensor<4xi64> + %126 = "tfl.quantize"(%cst) {qtype = tensor<6x3x7x7x!quant.uniform:f32:0, {1.412750e-03,3.503970e-04,2.441410e-04,3.823330e-04,2.441410e-04,8.950800e-04}>>} : (tensor<6x3x7x7xf32>) -> tensor<6x3x7x7x!quant.uniform:f32:0, {1.412750e-03,3.503970e-04,2.441410e-04,3.823330e-04,2.441410e-04,8.950800e-04}>> + %127 = "tfl.dequantize"(%126) : (tensor<6x3x7x7x!quant.uniform:f32:0, {1.412750e-03,3.503970e-04,2.441410e-04,3.823330e-04,2.441410e-04,8.950800e-04}>>) -> tensor<6x3x7x7xf32> + %129 = "tf.Transpose"(%127, %cst_14) : (tensor<6x3x7x7xf32>, tensor<4xi64>) -> tensor<7x7x3x6xf32> + %130 = 
"tf.Conv2D"(%arg0, %129) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 3, 3, 3, 3, 0, 0], padding = "EXPLICIT", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> : (tensor<1x224x224x3xf32>, tensor<7x7x3x6xf32>) -> tensor<1x112x112x6xf32> + return %130 : tensor<1x112x112x6xf32> + // CHECK: %cst = arith.constant dense<6.000000e+00> : tensor<6x7x7x3xf32> + // CHECK: %cst_0 = arith.constant dense<0.000000e+00> : tensor<6xf32> + // CHECK: %cst_1 = arith.constant dense<{{\[\[}}0, 0], [3, 3], [3, 3], [0, 0]]> : tensor<4x2xi32> + // CHECK: %0 = "tf.Pad"(%arg0, %cst_1) : (tensor<1x224x224x3xf32>, tensor<4x2xi32>) -> tensor<*xf32> + // CHECK: %1 = "tfl.quantize"(%cst) <{qtype = tensor<6x7x7x3x!quant.uniform:f32:0, {1.412750e-03,3.503970e-04,2.441410e-04,3.823330e-04,2.441410e-04,8.950800e-04}>>}> : (tensor<6x7x7x3xf32>) -> tensor<6x7x7x3x!quant.uniform:f32:0, {1.412750e-03,3.503970e-04,2.441410e-04,3.823330e-04,2.441410e-04,8.950800e-04}>> + // CHECK: %2 = "tfl.dequantize"(%1) : (tensor<6x7x7x3x!quant.uniform:f32:0, {1.412750e-03,3.503970e-04,2.441410e-04,3.823330e-04,2.441410e-04,8.950800e-04}>>) -> tensor<6x7x7x3xf32> + // CHECK: %3 = "tfl.conv_2d"(%0, %2, %cst_0) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<*xf32>, tensor<6x7x7x3xf32>, tensor<6xf32>) -> tensor<1x112x112x6xf32> + // CHECK: return %3 : tensor<1x112x112x6xf32> +} + +// CHECK-LABEL: @FoldDoubleTranspose +func.func @FoldDoubleTranspose(%arg0: tensor<1x4x1440x256xf32>) -> tensor<1x1440x256x4xf32> { + %cst_12 = arith.constant dense<[0, 1, 3, 2]> : tensor<4xi32> + %cst_18 = arith.constant dense<[0, 2, 1, 3]> : tensor<4xi32> + %2112 = "tf.Transpose"(%arg0, %cst_18) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x1440x4x256xf32> + %2114 = "tf.Transpose"(%2112, %cst_12) : (tensor<1x1440x4x256xf32>, tensor<4xi32>) -> tensor<1x1440x256x4xf32> + return %2114 : tensor<1x1440x256x4xf32> + // CHECK-DAG: %cst = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> + // CHECK: %0 = "tf.Transpose"(%arg0, %cst) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x1440x256x4xf32> + // CHECK: return %0 +} + +// CHECK-LABEL: @FoldMultpleTranspose +func.func @FoldMultpleTranspose(%arg0: tensor<1x4x1440x256xf32>) -> tensor<1x256x4x1440xf32> { + %cst_11 = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> + %cst_12 = arith.constant dense<[0, 1, 3, 2]> : tensor<4xi32> + %cst_18 = arith.constant dense<[0, 2, 1, 3]> : tensor<4xi32> + %2112 = "tf.Transpose"(%arg0, %cst_11) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x1440x256x4xf32> + %2113 = "tf.Transpose"(%2112, %cst_18) : (tensor<1x1440x256x4xf32>, tensor<4xi32>) -> tensor<1x256x1440x4xf32> + %2114 = "tf.Transpose"(%2113, %cst_12) : (tensor<1x256x1440x4xf32>, tensor<4xi32>) -> tensor<1x256x4x1440xf32> + return %2114 : tensor<1x256x4x1440xf32> + // CHECK-DAG: %cst = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> + // CHECK: %0 = "tf.Transpose"(%arg0, %cst) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x256x4x1440xf32> + // CHECK: return %0 +} + +// CHECK-LABEL @FoldTrivialReshapeIntoTranspose +func.func @FoldTrivialReshapeIntoTranspose(%arg: tensor<2x1x3x3xf32>) -> tensor<1x3x3x2xf32> { + %cst = arith.constant dense<[1, 3, 3, 2]> : tensor<4xi32> + %cst_2 = arith.constant dense<[2, 3, 0, 1]> : tensor<4xi32> + %2 = "tf.Transpose"(%arg, %cst_2) : (tensor<2x1x3x3xf32>, tensor<4xi32>) -> tensor<3x3x2x1xf32> + %3 = "tf.Reshape"(%2, 
%cst) : (tensor<3x3x2x1xf32>, tensor<4xi32>) -> tensor<1x3x3x2xf32> + return %3 : tensor<1x3x3x2xf32> + // CHECK: %cst = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32> + // CHECK: %0 = "tf.Transpose"(%arg0, %cst) : (tensor<2x1x3x3xf32>, tensor<4xi32>) -> tensor<1x3x3x2xf32> + // CHECK: return %0 : tensor<1x3x3x2xf32> +} + +// CHECK-LABEL: @MoveTransposeAcrossDepthwiseConvPerChannelQuant +func.func @MoveTransposeAcrossDepthwiseConvPerChannelQuant(%arg0: tensor<1x112x112x2xf32>) -> tensor<1x112x112x2xf32> { + %cst = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<2xf32> + %cst_1 = arith.constant dense<6.000000e+00> : tensor<2x1x3x3xf32> + %0 = "tfl.quantize"(%cst_1) {qtype = tensor<2x1x3x3x!quant.uniform:f32:0, {6.587140e-03,1.888450e-02}>>} : (tensor<2x1x3x3xf32>) -> tensor<2x1x3x3x!quant.uniform:f32:0, {6.587140e-03,1.888450e-02}>> + %1 = "tfl.dequantize"(%0) : (tensor<2x1x3x3x!quant.uniform:f32:0, {6.587140e-03,1.888450e-02}>>) -> tensor<2x1x3x3xf32> + %2 = "tf.Transpose"(%1, %cst) : (tensor<2x1x3x3xf32>, tensor<4xi32>) -> tensor<1x3x3x2xf32> + %3 = "tfl.depthwise_conv_2d"(%arg0, %2, %cst_0) {depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> + return %3 : tensor<1x112x112x2xf32> + // CHECK: %cst = arith.constant dense<0.000000e+00> : tensor<2xf32> + // CHECK: %cst_0 = arith.constant dense<6.000000e+00> : tensor<1x3x3x2xf32> + // CHECK: %0 = "tfl.quantize"(%cst_0) <{qtype = tensor<1x3x3x2x!quant.uniform:f32:3, {6.587140e-03,1.888450e-02}>>}> : (tensor<1x3x3x2xf32>) -> tensor<1x3x3x2x!quant.uniform:f32:3, {6.587140e-03,1.888450e-02}>> + // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x3x3x2x!quant.uniform:f32:3, {6.587140e-03,1.888450e-02}>>) -> tensor<1x3x3x2xf32> + // CHECK: %2 = "tfl.depthwise_conv_2d"(%arg0, %1, %cst) <{depth_multiplier = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x112x112x2xf32>, tensor<1x3x3x2xf32>, tensor<2xf32>) -> tensor<1x112x112x2xf32> + // CHECK: return %2 : tensor<1x112x112x2xf32> +} + } diff --git a/tensorflow/compiler/mlir/lite/tests/push-tpose-through-ewise.mlir b/tensorflow/compiler/mlir/lite/tests/push-tpose-through-ewise.mlir index f5a6d68c6f7ec3..a5da33ca90191b 100644 --- a/tensorflow/compiler/mlir/lite/tests/push-tpose-through-ewise.mlir +++ b/tensorflow/compiler/mlir/lite/tests/push-tpose-through-ewise.mlir @@ -95,7 +95,7 @@ func.func @pushTposeBcastNoChange(%arg0: tensor<2x3x4x1xf32>) -> tensor<5x2x3x4x // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<5x2x3x4xf32> // CHECK: %cst_0 = arith.constant dense<[3, 0, 1, 2]> : tensor<4xi32> // CHECK: %0 = "tfl.transpose"(%arg0, %cst_0) : (tensor<2x3x4x1xf32>, tensor<4xi32>) -> tensor<1x2x3x4xf32> -// CHECK: %1 = tfl.add(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x2x3x4xf32>, tensor<5x2x3x4xf32>) -> tensor<5x2x3x4xf32> +// CHECK: %1 = tfl.add(%0, %cst) <{fused_activation_function = "NONE"}> : (tensor<1x2x3x4xf32>, tensor<5x2x3x4xf32>) -> tensor<5x2x3x4xf32> // ----- @@ -110,7 +110,7 @@ func.func @doubleTposeOneBroadcastInput(%arg0: tensor<2x3x4x1xf32>, %arg1: tenso } // CHECK: %cst = arith.constant dense<[3, 0, 1, 2]> : tensor<4xi32> -// CHECK: %0 = tfl.add(%arg0, %arg1)
{fused_activation_function = "NONE"} : (tensor<2x3x4x1xf32>, tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32> +// CHECK: %0 = tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<2x3x4x1xf32>, tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32> // CHECK: %1 = "tfl.transpose"(%0, %cst) : (tensor<2x3x4x5xf32>, tensor<4xi32>) -> tensor<5x2x3x4xf32> // CHECK: return %1 : tensor<5x2x3x4xf32> @@ -145,7 +145,7 @@ func.func @pushTposeBcastCstInput(%arg0: tensor<2x3x4x5xf32>) -> tensor<5x2x3x4x // CHECK: %cst = arith.constant dense<[3, 0, 1, 2]> : tensor<4xi32> // CHECK: %cst_0 = arith.constant dense<1.000000e+00> : tensor<2x3x4x1xf32> -// CHECK: %0 = tfl.add(%arg0, %cst_0) {fused_activation_function = "NONE"} : (tensor<2x3x4x5xf32>, tensor<2x3x4x1xf32>) -> tensor<2x3x4x5xf32> +// CHECK: %0 = tfl.add(%arg0, %cst_0) <{fused_activation_function = "NONE"}> : (tensor<2x3x4x5xf32>, tensor<2x3x4x1xf32>) -> tensor<2x3x4x5xf32> // CHECK: %1 = "tfl.transpose"(%0, %cst) : (tensor<2x3x4x5xf32>, tensor<4xi32>) -> tensor<5x2x3x4xf32> // ----- @@ -161,7 +161,7 @@ func.func @pushTposeBcastScalarCstInput(%arg0: tensor<2x3x4x5xf32>) -> tensor<5x // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor // CHECK: %cst_0 = arith.constant dense<[3, 0, 1, 2]> : tensor<4xi32> -// CHECK: %0 = tfl.add(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<2x3x4x5xf32>, tensor) -> tensor<2x3x4x5xf32> +// CHECK: %0 = tfl.add(%arg0, %cst) <{fused_activation_function = "NONE"}> : (tensor<2x3x4x5xf32>, tensor) -> tensor<2x3x4x5xf32> // CHECK: %1 = "tfl.transpose"(%0, %cst_0) : (tensor<2x3x4x5xf32>, tensor<4xi32>) -> tensor<5x2x3x4xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range-float16.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range-float16.mlir new file mode 100644 index 00000000000000..5e0599560975fb --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range-float16.mlir @@ -0,0 +1,78 @@ +// RUN: tf-opt %s -tfl-prepare-quantize-dynamic-range="enable-float16-quantization" -tfl-quantize="enable-dynamic-range-quantization=true" | FileCheck --check-prefix=CHECK %s + +// CHECK-LABEL: QuantizeUnidirectionalLstm +func.func @QuantizeUnidirectionalLstm(%arg0: tensor<1x2x3xf32>) -> (tensor<1x2x3xf32>) { + %1 = "tfl.pseudo_const"() {value = dense<[[0.1]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %2 = "tfl.pseudo_const"() {value = dense<[[0.2]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %3 = "tfl.pseudo_const"() {value = dense<[[0.3]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %4 = "tfl.pseudo_const"() {value = dense<[[0.4]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %5 = "tfl.pseudo_const"() {value = dense<[[0.5]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %6 = "tfl.pseudo_const"() {value = dense<[[0.6]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %7 = "tfl.pseudo_const"() {value = dense<[[0.7]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %8 = "tfl.pseudo_const"() {value = dense<[[0.8]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32> + %9 = "tfl.no_value"() {value} : () -> none + %10 = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %11 = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<3xf32>} : () -> tensor<3xf32> + %recurrent_input = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<1x3xf32>} : () -> tensor<1x3xf32> + %cell_input = "tfl.pseudo_const"() {value = dense<1.000000e+00> : tensor<1x3xf32>} : () -> tensor<1x3xf32> + %16 = "tfl.unidirectional_sequence_lstm"( + %arg0, + 
%1, %2, %3, %4, + %5, %6, %7, %8, + %9, %9, %9, + %10, %11, + %10, %10, + %9, %9, + %recurrent_input, %cell_input, + %9, %9, %9, %9) { + cell_clip = 1.000000e+01 : f32, + fused_activation_function = "TANH", + proj_clip = 0.000000e+00 : f32, + time_major = false} : ( + tensor<1x2x3xf32>, + tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, + tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, + none, none, none, + tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, + none, none, + tensor<1x3xf32>, tensor<1x3xf32>, + none, none, none, none) -> tensor<1x2x3xf32> + %17 = "quantfork.stats"(%16) {layerStats = dense<[-0.1, 0.1]> : tensor<2xf32>} : (tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + func.return %17 : tensor<1x2x3xf32> + + // CHECK: %[[NONE:.*]] = "tfl.no_value"() <{value}> : () -> none + // CHECK: %[[DQ_1:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_2:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_3:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_4:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_5:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_6:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_7:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_8:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x1xf16>) -> tensor<1x1xf32> + // CHECK: %[[DQ_9:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3xf16>) -> tensor<3xf32> + // CHECK: %[[DQ_10:.*]] = "tfl.dequantize"({{.*}}) : (tensor<3xf16>) -> tensor<3xf32> + // CHECK: %[[DQ_11:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x3xf16>) -> tensor<1x3xf32> + // CHECK: %[[DQ_12:.*]] = "tfl.dequantize"({{.*}}) : (tensor<1x3xf16>) -> tensor<1x3xf32> + // CHECK: %[[lstm:.*]] = "tfl.unidirectional_sequence_lstm"( + // CHECK-SAME: %arg0, + // CHECK-SAME: %[[DQ_1]], %[[DQ_2]], %[[DQ_3]], %[[DQ_4]], + // CHECK-SAME: %[[DQ_5]], %[[DQ_6]], %[[DQ_7]], %[[DQ_8]], + // CHECK-SAME: %[[NONE]], %[[NONE]], %[[NONE]], + // CHECK-SAME: %[[DQ_9]], %[[DQ_10]], %[[DQ_9]], %[[DQ_9]], + // CHECK-SAME: %[[NONE]], %[[NONE]], + // CHECK-SAME: %[[DQ_11]], %[[DQ_12]], + // CHECK-SAME: %[[NONE]], %[[NONE]], %[[NONE]], %[[NONE]]) <{ + // CHECK-SAME: cell_clip = 1.000000e+01 : f32, + // CHECK-SAME: fused_activation_function = "TANH", + // CHECK-SAME: proj_clip = 0.000000e+00 : f32, + // CHECK-SAME: time_major = false}> : ( + // CHECK-SAME: tensor<1x2x3xf32>, + // CHECK-SAME: tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, + // CHECK-SAME: tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, tensor<1x1xf32>, + // CHECK-SAME: none, none, none, + // CHECK-SAME: tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, + // CHECK-SAME: none, none, + // CHECK-SAME: tensor<1x3xf32>, tensor<1x3xf32>, + // CHECK-SAME: none, none, none, none) + // CHECK-SAME: -> tensor<1x2x3xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir index ad4ff5a129f4a2..47a2947692d1eb 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir @@ -18,39 +18,39 @@ func.func @QuantizeConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x64 func.return %conv : tensor<1x112x112x64xf32> // CHECK: 
%[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32:0, { -// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) { +// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32:0, { +// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) <{ // CHECK-NOT: asymmetric_quantize_inputs = true // CHECK-SAME: dilation_h_factor = 1 : i32 // CHECK: return %[[conv:.*]] // PerTensor: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> -// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) { +// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) <{ // PerTensor-NOT: asymmetric_quantize_inputs = true // PerTensor-SAME: dilation_h_factor = 1 : i32 // PerTensor: return %[[conv:.*]] // PerChannelWeightOnly: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32:0, { +// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32:0, { // PerChannelWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<64x3x3x3x!quant.uniform:f32:0, { -// PerChannelWeightOnly: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) { +// PerChannelWeightOnly: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) <{ // PerChannelWeightOnly-NOT: asymmetric_quantize_inputs = true // PerChannelWeightOnly-SAME: dilation_h_factor = 1 : i32 // PerChannelWeightOnly: return %[[conv:.*]] // PerTensorWeightOnly: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// PerTensorWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// PerTensorWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // PerTensorWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> -// PerTensorWeightOnly: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) { +// PerTensorWeightOnly: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) <{ // PerTensorWeightOnly-NOT: asymmetric_quantize_inputs = true // PerTensorWeightOnly-SAME: dilation_h_factor = 1 : i32 // PerTensorWeightOnly: return %[[conv:.*]] // BLOCK: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// BLOCK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// BLOCK: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // BLOCK: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> -// BLOCK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) { +// BLOCK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) <{ // BLOCK: return %[[conv:.*]] } @@ -63,15 +63,15 @@ func.func @QuantizeDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1 func.return %dconv : tensor<1x112x112x64xf32> // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32> -// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00} -// CHECK: %[[dconv:.*]] = 
"tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) { +// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00} +// CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) <{ // CHECK-NOT: asymmetric_quantize_inputs = true // CHECK-SAME: depth_multiplier = 4 : i32 // CHECK: return %[[dconv:.*]] // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32> -// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> -// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) { +// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) <{ // PerTensor-NOT: asymmetric_quantize_inputs = true // PerTensor-SAME: depth_multiplier = 4 : i32 // PerTensor: return %[[dconv:.*]] @@ -88,31 +88,31 @@ func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11 func.return %fc : tensor<1x112x112x512xf32> // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 -// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) { +// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 +// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) <{ // CHECK-NOT: fused_activation_function = "NONE", // CHECK-SAME: asymmetric_quantize_inputs = true, // CHECK: return %[[fc:.*]] // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>> -// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) { +// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) <{ // PerTensor-NOT: fused_activation_function = "NONE", // PerTensor-SAME: asymmetric_quantize_inputs = true, // PerTensor: return %[[fc:.*]] // PerChannelWeightOnly: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 +// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 // PerChannelWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 -// PerChannelWeightOnly: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) { +// PerChannelWeightOnly: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) <{ // PerChannelWeightOnly-NOT: fused_activation_function = "NONE", // PerChannelWeightOnly-SAME: asymmetric_quantize_inputs = true, // PerChannelWeightOnly: return %[[fc:.*]] // PerTensorWeightOnly: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// PerTensorWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>> +// PerTensorWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = 
tensor<512x12x!quant.uniform:f32, 1.000000e+00>> // PerTensorWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<512x12x!quant.uniform:f32, 1.000000e+00>> -// PerTensorWeightOnly: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) { +// PerTensorWeightOnly: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) <{ // PerTensorWeightOnly-NOT: fused_activation_function = "NONE", // PerTensorWeightOnly-SAME: asymmetric_quantize_inputs = true, // PerTensorWeightOnly: return %[[fc:.*]] @@ -126,13 +126,13 @@ func.func @QuantizeMatmulWithActConst(%arg0: tensor<1x3x3x512xf32>) -> tensor<1x %mm = "tfl.batch_matmul"(%arg0, %w) {adj_x = false, adj_y = false} : (tensor<1x3x3x512xf32>, tensor<512x12xf32>) -> tensor<1x3x3x12xf32> func.return %mm : tensor<1x3x3x12xf32> -// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>>, -// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) {adj_x = false, adj_y = false +// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>>, +// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) <{adj_x = false, adj_y = false // CHECK-SAME: , asymmetric_quantize_inputs = true // CHECK: return %[[mm:.*]] -// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>>, -// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) {adj_x = false, adj_y = false +// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>>, +// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) <{adj_x = false, adj_y = false // PerTensor-SAME: , asymmetric_quantize_inputs = true // PerTensor: return %[[mm:.*]] } @@ -148,33 +148,33 @@ func.func @QuantizeTransposeConvWeightOnly(%arg0: tensor<32x4x4x128xf32>, %arg1: func.return %tconv : tensor<1x32x42x128xf32> // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32> -// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>> +// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>) -> tensor<1x32x42x128xf32> -// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) { +// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) <{ // CHECK-NOT: asymmetric_quantize_inputs = true // CHECK-SAME: padding = "SAME" // CHECK: return %[[tconv:.*]] // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32> -// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1x32x42x128xf32> -// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) { +// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) <{ // PerTensor-NOT: asymmetric_quantize_inputs = true // PerTensor-SAME: padding = "SAME" // PerTensor: return %[[tconv:.*]] // PerChannelWeightOnly: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32> -// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = 
tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>> +// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>> // PerChannelWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>) -> tensor<1x32x42x128xf32> -// PerChannelWeightOnly: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) { +// PerChannelWeightOnly: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) <{ // PerChannelWeightOnly-NOT: asymmetric_quantize_inputs = true // PerChannelWeightOnly-SAME: padding = "SAME" // PerChannelWeightOnly: return %[[tconv:.*]] // PerTensorWeightOnly: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32> -// PerTensorWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>> +// PerTensorWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>> // PerTensorWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1x32x42x128xf32> -// PerTensorWeightOnly: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) { +// PerTensorWeightOnly: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[dq_w]], %arg0, %[[b]]) <{ // PerTensorWeightOnly-NOT: asymmetric_quantize_inputs = true // PerTensorWeightOnly-SAME: padding = "SAME" // PerTensorWeightOnly: return %[[tconv:.*]] @@ -188,12 +188,12 @@ func.func @QuantizeGatherWeightOnly(%arg0: tensor<3xi32>) -> tensor<3x3x3x3xf32> %emb_s = "quantfork.stats"(%emb) {layerStats = dense<[0.000000e+00, 1.000000e+01]> : tensor<2xf32>} : (tensor<3x3x3x3xf32>) -> tensor<3x3x3x3xf32> func.return %emb_s : tensor<3x3x3x3xf32> -// CHECK: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// CHECK: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> // CHECK: %[[emb:.*]] = "tfl.gather"(%[[dq_w]], %arg0) // CHECK: return %[[emb:.*]] -// PerTensor: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> // PerTensor: %[[emb:.*]] = "tfl.gather"(%[[dq_w]], %arg0) // PerTensor: return %[[emb:.*]] @@ -209,16 +209,16 @@ func.func @QuantizeCustomOp(%arg0: tensor<1x1x1x1xf32>) -> tensor<*xf32> attribu func.return %custom : tensor<*xf32> // CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1024x1x1x1xf32> -// CHECK: %[[custom:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} +// CHECK: %[[custom:.*]] = "tfl.custom"(%arg0, %[[w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> // CHECK: return %[[custom:.*]] -// CustomOpWeightOnly: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CustomOpWeightOnly: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> // CustomOpWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w:.*]]) : (tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>>) -> 
tensor<1024x1x1x1xf32> -// CustomOpWeightOnly: %[[custom:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} +// CustomOpWeightOnly: %[[custom:.*]] = "tfl.custom"(%arg0, %[[dq_w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> // CustomOpWeightOnly: return %[[custom:.*]] -// CustomOpNotWeightOnly: %[[q_w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> -// CustomOpNotWeightOnly: %[[custom:.*]] = "tfl.custom"(%arg0, %[[q_w:.*]]) {custom_code = "CustomTestOp", custom_option = #tfl} +// CustomOpNotWeightOnly: %[[q_w:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<1024x1x1x1x!quant.uniform:f32, 1.000000e+00>> +// CustomOpNotWeightOnly: %[[custom:.*]] = "tfl.custom"(%arg0, %[[q_w:.*]]) <{custom_code = "CustomTestOp", custom_option = #tfl}> // CustomOpNotWeightOnly: return %[[custom:.*]] } @@ -234,22 +234,22 @@ func.func @NotQuantizeConv3D(%arg0: tensor<1x32x32x32x8xf32>) -> tensor<1x32x32x // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x1x8x16xf32> // CHECK-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// CHECK: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} +// CHECK: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> // CHECK: return %[[conv_3d:.*]] // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x1x8x16xf32> // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// PerTensor: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} +// PerTensor: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> // PerTensor: return %[[conv_3d:.*]] // PerChannelWeightOnly: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x1x8x16xf32> // PerChannelWeightOnly: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// PerChannelWeightOnly: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) {dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} +// PerChannelWeightOnly: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> // PerChannelWeightOnly: return %[[conv_3d:.*]] // PerTensorWeightOnly: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x1x8x16xf32> // PerTensorWeightOnly: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// PerTensorWeightOnly: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) 
{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32} +// PerTensorWeightOnly: %[[conv_3d:.*]] = "tfl.conv_3d"(%arg0, %[[w]], %[[b]]) <{dilation_d_factor = 1 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_d = 1 : i32, stride_h = 1 : i32, stride_w = 1 : i32}> // PerTensorWeightOnly: return %[[conv_3d:.*]] } @@ -266,50 +266,50 @@ func.func @QuantizeMultiUses(%arg0: tensor<1x224x224x3xf32>, %arg1: tensor<3xi32 func.return %bmm, %emb : tensor<1x112x112x112xf32>, tensor<3x3x3x3xf32> // CHECK-DAG: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// CHECK-DAG: %[[w1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// CHECK-DAG: %[[w1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // CHECK-DAG: %[[dq_w1:.*]] = "tfl.dequantize"(%[[w1]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> -// CHECK-DAG: %[[w2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00} -// CHECK-DAG: %[[w3:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00 +// CHECK-DAG: %[[w2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00}> +// CHECK-DAG: %[[w3:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00 // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w3]], %[[b]]) // CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w2]], %[[b]]) // CHECK: %[[emb:.*]] = "tfl.gather"(%[[dq_w1]], %arg1) -// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true +// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) <{adj_x = false, adj_y = true // CHECK-NOT: , asymmetric_quantize_inputs = true // CHECK-SAME: } // CHECK: return %[[bmm:.*]], %[[emb:.*]] // PerTensor: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// PerTensor: %[[w1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[w1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // PerTensor: %[[dq_w1:.*]] = "tfl.dequantize"(%[[w1]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> // PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w1]], %[[b]]) // PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w1]], %[[b]]) // PerTensor: %[[emb:.*]] = "tfl.gather"(%[[dq_w1]], %arg1) -// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true +// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) <{adj_x = false, adj_y = true // PerTensor-NOT: , asymmetric_quantize_inputs = true // PerTensor-SAME: } // PerTensor: return %[[bmm:.*]], %[[emb:.*]] // PerChannelWeightOnly-DAG: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// PerChannelWeightOnly-DAG: %[[w1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// PerChannelWeightOnly-DAG: %[[w1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // PerChannelWeightOnly-DAG: %[[dq_w1:.*]] = 
"tfl.dequantize"(%[[w1]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> -// PerChannelWeightOnly-DAG: %[[w2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00} +// PerChannelWeightOnly-DAG: %[[w2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00} // PerChannelWeightOnly-DAG: %[[dq_w2:.*]] = "tfl.dequantize"(%[[w2]]) : (tensor<64x3x3x3x!quant.uniform:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00}>>) -> tensor<64x3x3x3xf32> -// PerChannelWeightOnly-DAG: %[[w3:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00 +// PerChannelWeightOnly-DAG: %[[w3:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00 // PerChannelWeightOnly-DAG: %[[dq_w3:.*]] = "tfl.dequantize"(%[[w3]]) : (tensor<64x3x3x3x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00 // PerChannelWeightOnly: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w3]], %[[b]]) // PerChannelWeightOnly: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w2]], %[[b]]) // PerChannelWeightOnly: %[[emb:.*]] = "tfl.gather"(%[[dq_w1]], %arg1) -// PerChannelWeightOnly: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true +// PerChannelWeightOnly: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) <{adj_x = false, adj_y = true // PerChannelWeightOnly-NOT: , asymmetric_quantize_inputs = true // PerChannelWeightOnly-SAME: } // PerChannelWeightOnly: return %[[bmm:.*]], %[[emb:.*]] // BLOCK: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32> -// BLOCK: %[[w1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> +// BLOCK: %[[w1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>> // BLOCK: %[[dq_w1:.*]] = "tfl.dequantize"(%[[w1]]) : (tensor<64x3x3x3x!quant.uniform:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32> // BLOCK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w1]], %[[b]]) // BLOCK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w1]], %[[b]]) // BLOCK: %[[emb:.*]] = "tfl.gather"(%[[dq_w1]], %arg1) -// BLOCK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true +// BLOCK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) <{adj_x = false, adj_y = true // BLOCK: return %[[bmm:.*]], %[[emb:.*]] } diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-numeric-verify.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-numeric-verify.mlir index 7990b3aaf9e151..d043b98aa94899 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize-numeric-verify.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize-numeric-verify.mlir @@ -18,7 +18,7 @@ func.func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3xf32> // DEBUG: %[[f_conv:.*]] = "tfl.conv_2d"(%[[act]], %[[wt]], %[[bias]]) // DEBUG: %[[q_conv:.*]] = "tfl.conv_2d" -// DEBUG: "tfl.NumericVerify"(%[[q_conv]], %[[f_conv]]) {log_if_failed = true, tolerance = 5.000000e+00 : f32} +// DEBUG: "tfl.NumericVerify"(%[[q_conv]], %[[f_conv]]) <{log_if_failed = true, tolerance = 5.000000e+00 : f32}> // DEBUG: return %[[q_conv]] : tensor<1x112x112x32x!quant.uniform> } @@ -56,8 +56,8 @@ func.func @QuantizeSplit(%arg: tensor<4x!quant.uniform>, %cst: tens // DEBUG: %[[f_split:.*]]:2 = "tfl.split" // DEBUG: %[[q_split:.*]]:2 = "tfl.split" -// DEBUG: 
"tfl.NumericVerify"(%[[q_split]]#1, %[[f_split]]#1) {log_if_failed = true, tolerance = 5.000000e+00 : f32} -// DEBUG: "tfl.NumericVerify"(%[[q_split]]#0, %[[f_split]]#0) {log_if_failed = true, tolerance = 5.000000e+00 : f32} +// DEBUG: "tfl.NumericVerify"(%[[q_split]]#1, %[[f_split]]#1) <{log_if_failed = true, tolerance = 5.000000e+00 : f32}> +// DEBUG: "tfl.NumericVerify"(%[[q_split]]#0, %[[f_split]]#0) <{log_if_failed = true, tolerance = 5.000000e+00 : f32}> } // DEBUG-LABEL: NotQuantizePow diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir index 58dfed58a698e7..a5ac48521818ab 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir @@ -8,10 +8,10 @@ func.func @QuantizeReadVariable() -> (tensor<1x2x1x3x!quant.uniform %3 = "tfl.quantize"(%2) {qtype = tensor<1x2x1x3x!quant.uniform>, volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> func.return %3 : tensor<1x2x1x3x!quant.uniform> -// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() {container = "", shared_name = ""} : () -> tensor<*x!tf_type.resource>>> +// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() <{container = "", shared_name = ""}> : () -> tensor<*x!tf_type.resource>>> // CHECK-NEXT: %[[rv:.*]] = "tfl.read_variable"(%[[vh]]) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[rv]]) : (tensor<1x2x1x3x!quant.uniform>) -> tensor<1x2x1x3xf32> -// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%[[dq]]) {qtype = tensor<1x2x1x3x!quant.uniform>, volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> +// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%[[dq]]) <{qtype = tensor<1x2x1x3x!quant.uniform>}> {volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: return %[[q]] : tensor<1x2x1x3x!quant.uniform> } @@ -22,7 +22,7 @@ func.func @QuantizeAssignVariableWithDequantAndEqualType(%arg0 : tensor<1x2x1x3x "tfl.assign_variable"(%0, %1) : (tensor, tensor<1x2x1x3xf32>) -> () func.return %arg0 : tensor<1x2x1x3x!quant.uniform> -// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() {container = "", shared_name = ""} : () -> tensor<*x!tf_type.resource>>> +// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() <{container = "", shared_name = ""}> : () -> tensor<*x!tf_type.resource>>> // CHECK-NEXT: "tfl.assign_variable"(%[[vh]], %arg0) : (tensor<*x!tf_type.resource>>>, tensor<1x2x1x3x!quant.uniform>) -> () // CHECK-NEXT: return %arg0 : tensor<1x2x1x3x!quant.uniform> } @@ -36,11 +36,11 @@ func.func @QuantizeAssignVariableWithDequantAndNotEqualType(%arg0 : tensor<1x2x1 "tfl.assign_variable"(%1, %5) : (tensor, tensor<1x2x1x3xf32>) -> () func.return %arg0 : tensor<1x2x1x3x!quant.uniform> -// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() {container = "", shared_name = ""} : () -> tensor<*x!tf_type.resource>>> +// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() <{container = "", shared_name = ""}> : () -> tensor<*x!tf_type.resource>>> // CHECK-NEXT: %[[rv:.*]] = "tfl.read_variable"(%[[vh]]) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[rv]]) : (tensor<1x2x1x3x!quant.uniform>) -> tensor<1x2x1x3xf32> -// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[dq]]) {qtype = tensor<1x2x1x3x!quant.uniform>, volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> -// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x1x3x!quant.uniform>} : 
(tensor<1x2x1x3x!quant.uniform>) -> tensor<1x2x1x3x!quant.uniform> +// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[dq]]) <{qtype = tensor<1x2x1x3x!quant.uniform>}> {volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> +// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x1x3x!quant.uniform>}> : (tensor<1x2x1x3x!quant.uniform>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: "tfl.assign_variable"(%[[vh]], %[[q2]]) : (tensor<*x!tf_type.resource>>>, tensor<1x2x1x3x!quant.uniform>) -> () // CHECK-NEXT: return %arg0 : tensor<1x2x1x3x!quant.uniform> } @@ -54,10 +54,10 @@ func.func @QuantizeAssignVariableWithoutDequant(%arg0 : tensor<1x2x1x3xf32>) -> "tfl.assign_variable"(%0, %3) : (tensor, tensor<1x2x1x3xf32>) -> () func.return %arg0 : tensor<1x2x1x3xf32> -// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() {container = "", shared_name = ""} : () -> tensor<*x!tf_type.resource>>> +// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() <{container = "", shared_name = ""}> : () -> tensor<*x!tf_type.resource>>> // CHECK-NEXT: %[[rv:.*]] = "tfl.read_variable"(%[[vh]]) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: %[[dq:.*]] = "tfl.dequantize"(%[[rv]]) : (tensor<1x2x1x3x!quant.uniform>) -> tensor<1x2x1x3xf32> -// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%[[dq]]) {qtype = tensor<1x2x1x3x!quant.uniform>, volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> +// CHECK-NEXT: %[[q:.*]] = "tfl.quantize"(%[[dq]]) <{qtype = tensor<1x2x1x3x!quant.uniform>}> {volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: "tfl.assign_variable"(%[[vh]], %[[q]]) : (tensor<*x!tf_type.resource>>>, tensor<1x2x1x3x!quant.uniform>) -> () // CHECK-NEXT: return %arg0 : tensor<1x2x1x3xf32> } @@ -67,7 +67,7 @@ func.func @VarHandleCase(%arg0 : tensor<1x2x1x3xf32>) -> tensor<1x2x1x3xf32> { %0 = "tfl.var_handle"() : () -> tensor func.return %arg0 : tensor<1x2x1x3xf32> -// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() {container = "", shared_name = ""} : () -> tensor<*x!tf_type.resource>>> +// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() <{container = "", shared_name = ""}> : () -> tensor<*x!tf_type.resource>>> // CHECK-NEXT: return %arg0 : tensor<1x2x1x3xf32> } @@ -89,19 +89,19 @@ func.func @QuantizeReadAssign(%arg0: tensor<1x32x1x3xf32>) -> (tensor<1x34x1x3xf "tfl.assign_variable"(%2, %9) : (tensor, tensor<1x2x1x3xf32>) -> () func.return %6 : tensor<1x34x1x3xf32> -// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x32x1x3x!quant.uniform>, volatile} : (tensor<1x32x1x3xf32>) -> tensor<1x32x1x3x!quant.uniform> +// CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x32x1x3x!quant.uniform>}> {volatile} : (tensor<1x32x1x3xf32>) -> tensor<1x32x1x3x!quant.uniform> // CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x32x1x3x!quant.uniform>) -> tensor<1x32x1x3xf32> // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<1> : tensor<4xi32> // CHECK-NEXT: %[[cst_0:.*]] = arith.constant dense<[0, 0, 0, 3]> : tensor<4xi32> // CHECK-NEXT: %[[cst_1:.*]] = arith.constant dense<[0, -2, 0, 0]> : tensor<4xi32> -// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() {container = "", shared_name = "read_assign2/states"} : () -> tensor<*x!tf_type.resource>>> +// CHECK-NEXT: %[[vh:.*]] = "tfl.var_handle"() <{container = "", shared_name = "read_assign2/states"}> : () -> tensor<*x!tf_type.resource>>> // CHECK-NEXT: %[[rv:.*]] = "tfl.read_variable"(%[[vh]]) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x1x3x!quant.uniform> // 
CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[rv]]) : (tensor<1x2x1x3x!quant.uniform>) -> tensor<1x2x1x3xf32> -// CHECK-NEXT: %[[cc:.*]] = "tfl.concatenation"(%[[dq2]], %[[dq1]]) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x1x3xf32>, tensor<1x32x1x3xf32>) -> tensor<1x34x1x3xf32> -// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cc]]) {qtype = tensor<1x34x1x3x!quant.uniform>, volatile} : (tensor<1x34x1x3xf32>) -> tensor<1x34x1x3x!quant.uniform> +// CHECK-NEXT: %[[cc:.*]] = "tfl.concatenation"(%[[dq2]], %[[dq1]]) <{axis = 1 : i32, fused_activation_function = "NONE"}> : (tensor<1x2x1x3xf32>, tensor<1x32x1x3xf32>) -> tensor<1x34x1x3xf32> +// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cc]]) <{qtype = tensor<1x34x1x3x!quant.uniform>}> {volatile} : (tensor<1x34x1x3xf32>) -> tensor<1x34x1x3x!quant.uniform> // CHECK-NEXT: %[[dq3:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x34x1x3x!quant.uniform>) -> tensor<1x34x1x3xf32> -// CHECK-NEXT: %[[ss:.*]] = "tfl.strided_slice"(%[[dq3]], %[[cst_1]], %[[cst_0]], %[[cst]]) {begin_mask = 13 : i32, ellipsis_mask = 0 : i32, end_mask = 15 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<1x34x1x3xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> -// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%[[ss]]) {qtype = tensor<1x2x1x3x!quant.uniform>, volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> +// CHECK-NEXT: %[[ss:.*]] = "tfl.strided_slice"(%[[dq3]], %[[cst_1]], %[[cst_0]], %[[cst]]) <{begin_mask = 13 : i32, ellipsis_mask = 0 : i32, end_mask = 15 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32}> : (tensor<1x34x1x3xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> +// CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%[[ss]]) <{qtype = tensor<1x2x1x3x!quant.uniform>}> {volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: "tfl.assign_variable"(%[[vh]], %[[q3]]) : (tensor<*x!tf_type.resource>>>, tensor<1x2x1x3x!quant.uniform>) -> () // CHECK-NEXT: return %[[dq3]] : tensor<1x34x1x3xf32> } @@ -133,11 +133,11 @@ func.func @QuantizeConvVariable(%arg0: tensor<1x3x1x1xf32>) -> (tensor<1x3x1x1xf "tfl.assign_variable"(%6, %16) : (tensor, tensor<1x3x1x1xf32>) -> () func.return %10 : tensor<1x3x1x1xf32> -// WHOLE-PASSES: %[[vh:.*]] = "tfl.var_handle"() {container = "", shared_name = "conv_variable/state"} : () -> tensor<*x!tf_type.resource>>> +// WHOLE-PASSES: %[[vh:.*]] = "tfl.var_handle"() <{container = "", shared_name = "conv_variable/state"}> : () -> tensor<*x!tf_type.resource>>> // WHOLE-PASSES-NEXT: %[[rv:.*]] = "tfl.read_variable"(%[[vh]]) : (tensor<*x!tf_type.resource>>>) -> tensor<1x3x1x1x!quant.uniform> -// WHOLE-PASSES-DAG: %[[cv:.*]] = "tfl.conv_2d"(%arg0, {{.*}}) {{{.*}}} : (tensor<1x3x1x1x!quant.uniform>, tensor<1x3x1x1x!quant.uniform:f32:0, {{.*}}>>, tensor<1x!quant.uniform>) -> tensor<1x3x1x1x!quant.uniform> -// WHOLE-PASSES-NEXT: %[[cc:.*]] = "tfl.concatenation"(%[[rv]], %[[cv]]) {{{.*}}} : (tensor<1x3x1x1x!quant.uniform>, tensor<1x3x1x1x!quant.uniform>) -> tensor<1x6x1x1x!quant.uniform> -// WHOLE-PASSES-NEXT: %[[ss:.*]] = "tfl.strided_slice"(%[[cc]], {{.*}}) {{{.*}}} : (tensor<1x6x1x1x!quant.uniform>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x3x1x1x!quant.uniform> +// WHOLE-PASSES-DAG: %[[cv:.*]] = "tfl.conv_2d"(%arg0, {{.*}}) <{{{.*}}}> : (tensor<1x3x1x1x!quant.uniform>, tensor<1x3x1x1x!quant.uniform:f32:0, {{.*}}>>, tensor<1x!quant.uniform>) -> tensor<1x3x1x1x!quant.uniform> +// 
WHOLE-PASSES-NEXT: %[[cc:.*]] = "tfl.concatenation"(%[[rv]], %[[cv]]) <{{{.*}}}> : (tensor<1x3x1x1x!quant.uniform>, tensor<1x3x1x1x!quant.uniform>) -> tensor<1x6x1x1x!quant.uniform> +// WHOLE-PASSES-NEXT: %[[ss:.*]] = "tfl.strided_slice"(%[[cc]], {{.*}}) <{{{.*}}}> : (tensor<1x6x1x1x!quant.uniform>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x3x1x1x!quant.uniform> // WHOLE-PASSES-NEXT: "tfl.assign_variable"(%[[vh]], %[[ss]]) : (tensor<*x!tf_type.resource>>>, tensor<1x3x1x1x!quant.uniform>) -> () // WHOLE-PASSES-NEXT: return %[[cv]] : tensor<1x3x1x1x!quant.uniform> } @@ -171,19 +171,19 @@ func.func @QuantizeTwoVariable(%arg0: tensor<1x2x3xf32>) -> (tensor<1x2x3xf32>) func.return %0 : tensor<1x2x3xf32> -// WHOLE-PASSES: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x3x!quant.uniform>} : (tensor<1x2x3x!quant.uniform>) -> tensor<1x2x3x!quant.uniform> -// WHOLE-PASSES-DAG: %[[vh1:.*]] = "tfl.var_handle"() {container = "", shared_name = "read_assign/states0"} : () -> tensor<*x!tf_type.resource>>> -// WHOLE-PASSES-DAG: %[[vh2:.*]] = "tfl.var_handle"() {container = "", shared_name = "read_assign/states1"} : () -> tensor<*x!tf_type.resource>>> +// WHOLE-PASSES: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x3x!quant.uniform>}> : (tensor<1x2x3x!quant.uniform>) -> tensor<1x2x3x!quant.uniform> +// WHOLE-PASSES-DAG: %[[vh1:.*]] = "tfl.var_handle"() <{container = "", shared_name = "read_assign/states0"}> : () -> tensor<*x!tf_type.resource>>> +// WHOLE-PASSES-DAG: %[[vh2:.*]] = "tfl.var_handle"() <{container = "", shared_name = "read_assign/states1"}> : () -> tensor<*x!tf_type.resource>>> // WHOLE-PASSES-DAG: %[[rv1:.*]] = "tfl.read_variable"({{.*}}) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: %[[cc1:.*]] = "tfl.concatenation"(%[[rv1]], {{.*}}) {{.*}} : (tensor<1x2x3x!quant.uniform>, tensor<1x2x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> -// WHOLE-PASSES-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cc1]]) {qtype = tensor<1x4x3x!quant.uniform>} : (tensor<1x4x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> -// WHOLE-PASSES-NEXT: %[[ss1:.*]] = "tfl.strided_slice"(%[[q2]], {{.*}}) {{{.*}}} : (tensor<1x4x3x!quant.uniform>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3x!quant.uniform> +// WHOLE-PASSES-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cc1]]) <{qtype = tensor<1x4x3x!quant.uniform>}> : (tensor<1x4x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> +// WHOLE-PASSES-NEXT: %[[ss1:.*]] = "tfl.strided_slice"(%[[q2]], {{.*}}) <{{{.*}}}> : (tensor<1x4x3x!quant.uniform>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: "tfl.assign_variable"(%[[vh1]], %[[ss1]]) : (tensor<*x!tf_type.resource>>>, tensor<1x2x3x!quant.uniform>) -> () // WHOLE-PASSES-DAG: %[[rv2:.*]] = "tfl.read_variable"({{.*}}) : (tensor<*x!tf_type.resource>>>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: %[[cc2:.*]] = "tfl.concatenation"(%[[rv2]], {{.*}}) {{.*}} : (tensor<1x2x3x!quant.uniform>, tensor<1x2x3x!quant.uniform>) -> tensor<1x4x3x!quant.uniform> -// WHOLE-PASSES-NEXT: %[[ss2:.*]] = "tfl.strided_slice"(%[[cc2]], {{.*}}) {{{.*}}} : (tensor<1x4x3x!quant.uniform>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3x!quant.uniform> +// WHOLE-PASSES-NEXT: %[[ss2:.*]] = "tfl.strided_slice"(%[[cc2]], {{.*}}) <{{{.*}}}> : (tensor<1x4x3x!quant.uniform>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3x!quant.uniform> // WHOLE-PASSES-NEXT: "tfl.assign_variable"(%[[vh2]], %[[ss2]]) : 
(tensor<*x!tf_type.resource>>>, tensor<1x2x3x!quant.uniform>) -> () // WHOLE-PASSES-NEXT: return %arg0 : tensor<1x2x3x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/tests/quantize.mlir b/tensorflow/compiler/mlir/lite/tests/quantize.mlir index 69771e0100f496..f99d3cb409f0fc 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize.mlir @@ -8,7 +8,7 @@ func.func @QuantizeFloatConst() -> tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> func.return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<0> : tensor<2x2xi8>} +// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x2x!quant.uniform>, value = dense<0> : tensor<2x2xi8>}> // CHECK: return %[[cst]] } @@ -18,7 +18,7 @@ func.func @QuantizeFloatConst4Bits() -> tensor<2x4x!quant.uniform>} : (tensor<2x4xf32>) -> tensor<2x4x!quant.uniform> func.return %1 : tensor<2x4x!quant.uniform> -// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x4x!quant.uniform>, value = dense<{{\[\[}}-4, -3, -2, -1{{\]}}, [0, 1, 2, 3{{\]\]}}> : tensor<2x4xi4>} +// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x4x!quant.uniform>, value = dense<{{\[\[}}-4, -3, -2, -1{{\]}}, [0, 1, 2, 3{{\]\]}}> : tensor<2x4xi4>}> // CHECK: return %[[cst]] } @@ -28,7 +28,7 @@ func.func @QuantizeDenseFloatConst() -> tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> func.return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<{{\[\[}}0, -1], {{\[}}-1, -1]]> : tensor<2x2xi8>} +// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x2x!quant.uniform>, value = dense<{{\[\[}}0, -1], {{\[}}-1, -1]]> : tensor<2x2xi8>}> // CHECK: return %[[cst]] } @@ -38,7 +38,7 @@ func.func @QuantizeSplatFloatConst() -> tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> func.return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<-1> : tensor<2x2xi8>} +// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x2x!quant.uniform>, value = dense<-1> : tensor<2x2xi8>}> // CHECK: return %[[cst]] } @@ -60,7 +60,7 @@ func.func @DequantizeAndQuantize() -> tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> func.return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<-1> : tensor<2x2xi8>} +// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<2x2x!quant.uniform>, value = dense<-1> : tensor<2x2xi8>}> // CHECK: return %[[cst]] : tensor<2x2x!quant.uniform> } @@ -76,8 +76,8 @@ func.func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> func.return %6 : tensor<1x112x112x32x!quant.uniform> -// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-1583> : tensor<32xi32>} -// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 1.000000e-01>>, value = dense<1> : tensor<32x3x3x3xi8>} +// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<-1583> : tensor<32xi32>}> +// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 1.000000e-01>>, value = dense<1> : 
tensor<32x3x3x3xi8>}> // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[cst1]], %[[cst0]]) // CHECK: return %[[conv]] : tensor<1x112x112x32x!quant.uniform> } @@ -94,8 +94,8 @@ func.func @QuantizeConv2D4Bit(tensor<1x224x224x3x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> func.return %6 : tensor<1x112x112x32x!quant.uniform> -// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-1583> : tensor<32xi32>} -// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 1.000000e-01>>, value = dense<1> : tensor<32x3x3x3xi4>} +// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<-1583> : tensor<32xi32>}> +// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 1.000000e-01>>, value = dense<1> : tensor<32x3x3x3xi4>}> // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[cst1]], %[[cst0]]) // CHECK: return %[[conv]] : tensor<1x112x112x32x!quant.uniform> } @@ -111,9 +111,9 @@ func.func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> func.return %6 : tensor<1x112x112x32x!quant.uniform> -// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-7254> : tensor<32xi32>} -// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} -// CHECK: %[[conv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[cst1]], %[[cst0]]) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} +// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<-7254> : tensor<32xi32>}> +// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>}> +// CHECK: %[[conv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[cst1]], %[[cst0]]) <{depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> // CHECK: return %[[conv]] } @@ -128,9 +128,9 @@ func.func @QuantizeDepthwiseConv2D4Bit(tensor<1x224x224x3x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> func.return %6 : tensor<1x112x112x32x!quant.uniform> -// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-400> : tensor<32xi32>} -// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.39599830800000002:8>>, value = dense<-7> : tensor<32x3x3x3xi4>} -// CHECK: %[[conv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[cst1]], %[[cst0]]) {depth_multiplier = 4 : i32, dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32} +// CHECK: %[[cst0:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<-400> : tensor<32xi32>}> +// CHECK: %[[cst1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.39599830800000002:8>>, value = dense<-7> : tensor<32x3x3x3xi4>}> +// CHECK: %[[conv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[cst1]], %[[cst0]]) <{depth_multiplier = 4 : i32, 
dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 4 : i32, stride_w = 5 : i32}> // CHECK: return %[[conv]] } @@ -145,14 +145,14 @@ func.func @QuantizeFullyConnected(tensor<1x224x224x3x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> func.return %6 : tensor<1x112x112x32x!quant.uniform> -// CHECK: %[[cst_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-7254> : tensor<32xi32>} -// CHECK: %[[cst_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x12x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x12xi8>} -// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[cst_1]], %[[cst_0]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: %[[cst_0:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<-7254> : tensor<32xi32>}> +// CHECK: %[[cst_1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x12x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x12xi8>}> +// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[cst_1]], %[[cst_0]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: return %[[fc]] // BLOCK: %[[cst:.*]] = "tfl.pseudo_const"(){{.*}}dense<-1.23697901> // BLOCK: %[[dq1:.*]] = "tfl.dequantize"(%arg0) -// BLOCK: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x12x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x12xi8>} +// BLOCK: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x12x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x12xi8>}> // BLOCK: %[[dq2:.*]] = "tfl.dequantize"(%[[cst2]]) // BLOCK: %[[fc:.*]] = "tfl.fully_connected"(%[[dq1]], %[[dq2]], %[[cst]]) // BLOCK: %[[q:.*]] = "tfl.quantize"(%[[fc]]) @@ -170,14 +170,14 @@ func.func @QuantizeFullyConnected4Bit(tensor<1x224x224x3x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> func.return %6 : tensor<1x112x112x32x!quant.uniform> -// CHECK: %[[cst_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-400> : tensor<32xi32>} -// CHECK: %[[cst_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x12x!quant.uniform:f32, 0.39599830800000002:8>>, value = dense<-7> : tensor<32x12xi4>} -// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[cst_1]], %[[cst_0]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: %[[cst_0:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x!quant.uniform>, value = dense<-400> : tensor<32xi32>}> +// CHECK: %[[cst_1:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x12x!quant.uniform:f32, 0.39599830800000002:8>>, value = dense<-7> : tensor<32x12xi4>}> +// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[cst_1]], %[[cst_0]]) <{fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"}> // CHECK: return %[[fc]] // BLOCK: %[[cst:.*]] = "tfl.pseudo_const"(){{.*}}dense<-1.23697901> // BLOCK: %[[dq1:.*]] = "tfl.dequantize"(%arg0) -// BLOCK: %[[cst2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<32x12x!quant.uniform:f32, 0.39599830800000002:8>>, value = dense<-7> : tensor<32x12xi4>} +// BLOCK: %[[cst2:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<32x12x!quant.uniform:f32, 0.39599830800000002:8>>, value = dense<-7> : tensor<32x12xi4>}> // BLOCK: %[[dq2:.*]] = "tfl.dequantize"(%[[cst2]]) // BLOCK: 
%[[fc:.*]] = "tfl.fully_connected"(%[[dq1]], %[[dq2]], %[[cst]]) // BLOCK: %[[q:.*]] = "tfl.quantize"(%[[fc]]) @@ -254,13 +254,13 @@ func.func @QuantizeAdd(tensor<1x56x56x24x!quant.uniform>} : (tensor<1x56x56x24xf32>) -> tensor<1x56x56x24x!quant.uniform> func.return %3 : tensor<1x56x56x24x!quant.uniform> -// CHECK: %[[add:.*]] = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x56x56x24x!quant.uniform>, tensor<1x56x56x24x!quant.uniform>) +// CHECK: %[[add:.*]] = tfl.add(%arg0, %arg1) <{fused_activation_function = "NONE"}> : (tensor<1x56x56x24x!quant.uniform>, tensor<1x56x56x24x!quant.uniform>) // CHECK: return %[[add]] : tensor<1x56x56x24x!quant.uniform> // BLOCK: %[[dq0:.*]] = "tfl.dequantize"(%arg0) : (tensor<1x56x56x24x!quant.uniform>) // BLOCK: %[[dq1:.*]] = "tfl.dequantize"(%arg1) : (tensor<1x56x56x24x!quant.uniform>) // BLOCK: %[[add:.*]] = tfl.add %[[dq0]], %[[dq1]] {fused_activation_function = "NONE"} : tensor<1x56x56x24xf32> -// BLOCK: %[[q:.*]] = "tfl.quantize"(%[[add]]) {qtype = tensor<1x56x56x24x!quant.uniform>} +// BLOCK: %[[q:.*]] = "tfl.quantize"(%[[add]]) <{qtype = tensor<1x56x56x24x!quant.uniform>}> // BLOCK: return %[[q]] : tensor<1x56x56x24x!quant.uniform> } @@ -271,9 +271,9 @@ func.func @QuantizeConcat(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!quant %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> func.return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} -// CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q1]], %[[q0]]) {axis = 0 : i32, fused_activation_function = "NONE"} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} +// CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q1]], %[[q0]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> // CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> } @@ -285,9 +285,9 @@ func.func @QuantizeConcatRequantize(tensor<1x2x!quant.uniform>, %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> func.return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} -// CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q0]], %[[q1]]) {axis = 0 : i32, fused_activation_function = "NONE"} +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) <{qtype = tensor<1x2x!quant.uniform>}> {volatile} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) <{qtype = tensor<1x2x!quant.uniform>}> +// CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q0]], %[[q1]]) <{axis = 0 : i32, fused_activation_function = "NONE"}> // CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> } @@ -298,7 +298,7 @@ func.func @QuantizeMaxPool2D(tensor<1x6x6x16x!quant.uniform) -> tensor<1x1x1x16xf32> func.return %1 : tensor<1x1x1x16xf32> -// CHECK: %[[mp:.*]] = "tfl.max_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x1x1x16x!quant.uniform> +// CHECK: %[[mp:.*]] = "tfl.max_pool_2d"(%arg0) <{filter_height = 1 : i32, filter_width = 1 : i32, 
fused_activation_function = "RELU6", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x1x1x16x!quant.uniform> // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[mp]]) : (tensor<1x1x1x16x!quant.uniform>) -> tensor<1x1x1x16xf32> // CHECK: return %[[dq]] : tensor<1x1x1x16xf32> } @@ -311,7 +311,7 @@ func.func @QuantizeSplit(%arg: tensor<4x!quant.uniform>, %cst: tens %3 = "tfl.quantize"(%1#1) {qtype = tensor<2x!quant.uniform>} : (tensor<2xf32>) -> tensor<2x!quant.uniform> func.return %2, %3 : tensor<2x!quant.uniform>, tensor<2x!quant.uniform> -// CHECK: %[[sp:.*]]:2 = "tfl.split"(%arg1, %arg0) {num_splits = 2 : i32} : (tensor, tensor<4x!quant.uniform>) +// CHECK: %[[sp:.*]]:2 = "tfl.split"(%arg1, %arg0) <{num_splits = 2 : i32}> : (tensor, tensor<4x!quant.uniform>) // CHECK: return %[[sp]]#0, %[[sp]]#1 } @@ -324,7 +324,7 @@ func.func @QuantizeSplitUnusedResults(%arg: tensor<4x!quant.uniform %3 = "tfl.quantize"(%1#1) {qtype = tensor<2x!quant.uniform>} : (tensor<2xf32>) -> tensor<2x!quant.uniform> func.return %2, %3 : tensor<2x!quant.uniform>, tensor<2x!quant.uniform> -// CHECK: %[[sp:.*]]:4 = "tfl.split"(%arg1, %arg0) {num_splits = 4 : i32} : (tensor, tensor<4x!quant.uniform>) +// CHECK: %[[sp:.*]]:4 = "tfl.split"(%arg1, %arg0) <{num_splits = 4 : i32}> : (tensor, tensor<4x!quant.uniform>) // CHECK: return %[[sp]]#0, %[[sp]]#1 } @@ -440,7 +440,7 @@ func.func @CheckLegacyQuantizeAdd() -> tensor<1x2x!quant.uniform>, volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> func.return %0 : tensor<1x2x!quant.uniform> -// LEGACY: "tfl.pseudo_qconst"() {qtype = tensor<1x2x!quant.uniform>, value = dense<{{\[\[}}-1, 127]]> : tensor<1x2xi8>} +// LEGACY: "tfl.pseudo_qconst"() <{qtype = tensor<1x2x!quant.uniform>, value = dense<{{\[\[}}-1, 127]]> : tensor<1x2xi8>}> } func.func private @testIfThen(tensor<*xf32>) -> tensor<*xf32> @@ -467,8 +467,8 @@ func.func @NotQuantizeReadVariable() -> tensor<1x2x3x!quant.uniform:f3 %1 = "tfl.read_variable"(%0) : (tensor>>) -> tensor<1x2x3xf32> %2 = "tfl.quantize"(%1) {qtype = tensor<1x2x3x!quant.uniform:f32, 0.047244094488188976:128>>} : (tensor<1x2x3xf32>) -> tensor<1x2x3x!quant.uniform:f32, 0.047244094488188976:128>> func.return %2 : tensor<1x2x3x!quant.uniform:f32, 0.047244094488188976:128>> - // CHECK: %[[handle:.*]] = "tfl.var_handle"() {container = "", shared_name = "states"} : () -> tensor>> + // CHECK: %[[handle:.*]] = "tfl.var_handle"() <{container = "", shared_name = "states"}> : () -> tensor>> // CHECK-NEXT: %[[read:.*]] = "tfl.read_variable"(%[[handle]]) : (tensor>>) -> tensor<1x2x3xf32> - // CHECK-NEXT: %[[quantize:.*]] = "tfl.quantize"(%[[read]]) {qtype = tensor<1x2x3x!quant.uniform:f32, 0.047244094488188976:128>>} : (tensor<1x2x3xf32>) -> tensor<1x2x3x!quant.uniform:f32, 0.047244094488188976:128>> + // CHECK-NEXT: %[[quantize:.*]] = "tfl.quantize"(%[[read]]) <{qtype = tensor<1x2x3x!quant.uniform:f32, 0.047244094488188976:128>>}> : (tensor<1x2x3xf32>) -> tensor<1x2x3x!quant.uniform:f32, 0.047244094488188976:128>> // CHECK-NEXT: return %[[quantize]] } diff --git a/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir b/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir index 5baa981122985c..01a4a72749bf45 100644 --- a/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir +++ b/tensorflow/compiler/mlir/lite/tests/shape-inference.mlir @@ -3,7 +3,7 @@ module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testConv2dShapeValidPadding func.func @testConv2dShapeValidPadding(%arg0: 
tensor<1x112x80x128xf32>, %arg1: tensor<128x3x3x128xf32>, %arg2: tensor<128xf32>) -> tensor<1x?x?x128xf32> { - // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x108x76x128xf32> + // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) <{dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x108x76x128xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> func.return %0 : tensor<1x?x?x128xf32> } @@ -14,7 +14,7 @@ func.func @testConv2dShapeValidPadding(%arg0: tensor<1x112x80x128xf32>, %arg1: t module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testConv2dShapeInferenceSamePadding func.func @testConv2dShapeInferenceSamePadding(%arg0: tensor<1x112x80x128xf32>, %arg1: tensor<128x3x3x128xf32>, %arg2: tensor<128xf32>) -> tensor<1x?x?x128xf32> { - // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x112x80x128xf32> + // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x112x80x128xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> func.return %0 : tensor<1x?x?x128xf32> } @@ -25,7 +25,7 @@ func.func @testConv2dShapeInferenceSamePadding(%arg0: tensor<1x112x80x128xf32>, module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testConv2dShapeInferenceDilation func.func @testConv2dShapeInferenceDilation(%arg0: tensor<1x112x80x128xf32>, %arg1: tensor<128x3x3x128xf32>, %arg2: tensor<128xf32>) -> tensor<1x?x?x128xf32> { - // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x112x80x128xf32> + // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) <{dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x112x80x128xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x112x80x128xf32>, 
tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> func.return %0 : tensor<1x?x?x128xf32> } @@ -36,7 +36,7 @@ func.func @testConv2dShapeInferenceDilation(%arg0: tensor<1x112x80x128xf32>, %ar module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testConv2dShapeInferenceStrides func.func @testConv2dShapeInferenceStrides(%arg0: tensor<1x112x80x128xf32>, %arg1: tensor<128x3x3x128xf32>, %arg2: tensor<128xf32>) -> tensor<1x?x?x128xf32> { - // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x56x40x128xf32> + // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32}> : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x56x40x128xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x112x80x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> func.return %0 : tensor<1x?x?x128xf32> } @@ -47,7 +47,7 @@ func.func @testConv2dShapeInferenceStrides(%arg0: tensor<1x112x80x128xf32>, %arg module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testConv2dShapeInferenceUnranked func.func @testConv2dShapeInferenceUnranked(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -58,7 +58,7 @@ func.func @testConv2dShapeInferenceUnranked(%arg0: tensor<*xf32>, %arg1: tensor< module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testConv2dShapeInferenceDynamic func.func @testConv2dShapeInferenceDynamic(%arg0: tensor<1x?x?x128xf32>, %arg1: tensor<128x3x3x128xf32>, %arg2: tensor<128xf32>) -> tensor<1x?x?x128xf32> { - // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x?x?x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> + // CHECK: "tfl.conv_2d"(%arg0, %arg1, %arg2) <{dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32}> : (tensor<1x?x?x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) 
{dilation_h_factor = 2 : i32, dilation_w_factor = 2 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x?x?x128xf32>, tensor<128x3x3x128xf32>, tensor<128xf32>) -> tensor<1x?x?x128xf32> func.return %0 : tensor<1x?x?x128xf32> } @@ -80,7 +80,7 @@ func.func @testConv2dShapeInvalidRanks(%arg0: tensor<1x112x80xf32>, %arg1: tenso module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testUnidirectionalSequenceLstmShapeInference func.func @testUnidirectionalSequenceLstmShapeInference(%arg0: tensor<600 x 10 x 20 x f32>, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor<40 x f32>, %arg16: tensor, %arg17: tensor, %arg18: tensor<600 x 40 x f32>, %arg19: tensor<600 x 40 x f32>, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor<600x10x20xf32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor<40xf32>, tensor, tensor, tensor<600x40xf32>, tensor<600x40xf32>, tensor, tensor, tensor, tensor) -> tensor<600x10x40xf32 + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) <{fused_activation_function = "NONE", time_major = false}> : (tensor<600x10x20xf32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor<40xf32>, tensor, tensor, tensor<600x40xf32>, tensor<600x40xf32>, tensor, tensor, tensor, tensor) -> tensor<600x10x40xf32 %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor<600 x 10 x 20 x f32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor<40xf32>, tensor, tensor, tensor<600x40xf32>, tensor<600x40xf32>, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } @@ -91,7 +91,7 @@ func.func @testUnidirectionalSequenceLstmShapeInference(%arg0: tensor<600 x 10 x module attributes {tf.versions = {producer = 888 : i32}} { // CHECK-LABEL: testUnidirectionalSequenceLstmShapeInference func.func @testUnidirectionalSequenceLstmShapeInference(%arg0: tensor<600 x ? 
x 20 x f32>, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor<40 x f32>, %arg16: tensor, %arg17: tensor, %arg18: tensor<600 x 40 x f32>, %arg19: tensor<600 x 40 x f32>, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor<600x?x20xf32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor<40xf32>, tensor, tensor, tensor<600x40xf32>, tensor<600x40xf32>, tensor, tensor, tensor, tensor) -> tensor<600x?x40xf32 + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) <{fused_activation_function = "NONE", time_major = false}> : (tensor<600x?x20xf32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor<40xf32>, tensor, tensor, tensor<600x40xf32>, tensor<600x40xf32>, tensor, tensor, tensor, tensor) -> tensor<600x?x40xf32 %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor<600 x ? 
x 20 x f32>, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor<40xf32>, tensor, tensor, tensor<600x40xf32>, tensor<600x40xf32>, tensor, tensor, tensor, tensor) -> tensor func.return %0 : tensor } diff --git a/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir b/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir index 134b7a0dccf7f6..ffe1ee7264e8d1 100644 --- a/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir +++ b/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir @@ -2,9 +2,9 @@ func.func @testSingleLstm(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4x4x4xf32>) -> tensor<4x4x4xf32> { // CHECK-LABEL: testSingleLstm - // CHECK-DAG: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> - // CHECK-DAG: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> - // CHECK: %[[LSTM:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> + // CHECK-DAG: %[[CST_0:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf32>}> : () -> tensor<4x4xf32> + // CHECK-DAG: %[[CST_1:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf32>}> : () -> tensor<4x4xf32> + // CHECK: %[[LSTM:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) <{fused_activation_function = "NONE", time_major = true}> : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") %1 = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> @@ -13,15 +13,29 @@ func.func @testSingleLstm(%arg0: tensor<4x4xf32>, 
%arg1: tensor<4xf32>, %arg2: t func.func @testMultipleLstms(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4x4x4xf32>) -> tensor<4x4x4xf32> { // CHECK-LABEL: testMultipleLstms - // CHECK-DAG: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> - // CHECK-DAG: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> - // CHECK: %[[LSTM_1:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> - // CHECK-DAG: %[[CST_2:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> - // CHECK-DAG: %[[CST_3:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> - // CHECK: %[[LSTM_2:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%[[LSTM_1]], %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %[[CST_2]], %[[CST_3]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> + // CHECK-DAG: %[[CST_0:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf32>}> : () -> tensor<4x4xf32> + // CHECK-DAG: %[[CST_1:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf32>}> : () -> tensor<4x4xf32> + // CHECK: %[[LSTM_1:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) <{fused_activation_function = "NONE", time_major = true}> : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> + // CHECK-DAG: %[[CST_2:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf32>}> : () -> tensor<4x4xf32> + // CHECK-DAG: %[[CST_3:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf32>}> : () -> tensor<4x4xf32> + // CHECK: %[[LSTM_2:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%[[LSTM_1]], %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, 
%arg1, %arg1, %arg0, %arg1, %[[CST_2]], %[[CST_3]], %arg0, %arg0, %arg0, %arg0) <{fused_activation_function = "NONE", time_major = true}> : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") %1 = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> %2 = "tfl.unidirectional_sequence_lstm"(%1, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> func.return %2 : tensor<4x4x4xf32> } + +func.func @testSingleLstmFloat16(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>, %arg2: tensor<4x4x4xf32>) -> tensor<4x4x4xf32> { + // CHECK-LABEL: testSingleLstmFloat16 + // CHECK-DAG: %[[CST_0:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf16>}> : () -> tensor<4x4xf16> + // CHECK-DAG: %[[CST_1:.*]] = "tfl.pseudo_const"() <{value = dense<0.000000e+00> : tensor<4x4xf16>}> : () -> tensor<4x4xf16> + // CHECK-DAG: %[[DQ_0:.*]] = "tfl.dequantize"(%[[CST_0]]) : (tensor<4x4xf16>) -> tensor<4x4xf32> + // CHECK-DAG: %[[DQ_1:.*]] = "tfl.dequantize"(%[[CST_1]]) : (tensor<4x4xf16>) -> tensor<4x4xf32> + // CHECK: %[[LSTM:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %[[DQ_0]], %[[DQ_1]], %arg0, %arg0, %arg0, %arg0) <{fused_activation_function = "NONE", time_major = true}> : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> + + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf16>} : () -> tensor<4x4xf16> loc("Const") + %1 = "tfl.dequantize"(%0) : (tensor<4x4xf16>) ->
tensor<4x4xf32> + %2 = "tfl.unidirectional_sequence_lstm"(%arg2, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg1, %1, %1, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4x4xf32> + func.return %2 : tensor<4x4x4xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir b/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir index 71f717d4c7862b..8ef595c2dcf43e 100644 --- a/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir +++ b/tensorflow/compiler/mlir/lite/tests/tfl_while_outline.mlir @@ -204,7 +204,7 @@ func.func @whileSinkConstant(%arg0: tensor<1x256xf32>) -> tensor<1x256xf32> attr "tfl.yield"(%3) : (tensor) -> () }, { ^bb0(%arg1: tensor, %arg2: tensor<1x256xf32>): - // CHECK: %[[QCONST:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<256x256x!quant.uniform>, value = dense<1> : tensor<256x256xi8>} : () -> tensor<256x256x!quant.uniform> + // CHECK: %[[QCONST:.*]] = "tfl.pseudo_qconst"() <{qtype = tensor<256x256x!quant.uniform>, value = dense<1> : tensor<256x256xi8>}> : () -> tensor<256x256x!quant.uniform> // CHECK: %[[CONST:.*]] = arith.constant dense<1> : tensor<256x256xi8> %4 = "tfl.batch_matmul"(%arg2, %cst_0) {adj_x = false, adj_y = false} : (tensor<1x256xf32>, tensor<256x256xi8>) -> tensor<1x256xf32> // CHECK-NEXT: %[[BMM_0:.*]] = "tfl.batch_matmul"(%arg1, %[[CONST]]) diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index f4aa97069655e8..ccd6b8e559eac8 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_passes.h" #include "tensorflow/compiler/mlir/lite/quantization/tensorflow/passes.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/odml_converter/passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h" @@ -150,12 +151,14 @@ void AddPreQuantizationStableHloToTfPasses( // to be consistent with other entrypoints. pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); + pass_manager.addNestedPass( + mlir::odml::CreateOutlineCompositesPass()); // Decompose CHLO into StableHLO ops // TODO(b/331843141): There are some CHLO's like TopK which we could instead // lower to TFL ops. mlir::stablehlo::experimental::createChloLegalizeToStablehloPipeline( pass_manager); - + pass_manager.addPass(mlir::odml::CreateTransposeCommuteOpsPass()); // The following two passes find specific uniform quantization patterns in // StableHLO and converts them to TFLite ops that accept or produce uniform // quantized types. 
They only target a specific set of models that contain diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index c99780603abbb4..8d124af7cb246a 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -49,6 +49,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -61,7 +62,6 @@ limitations under the License. #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/public/session.h" #include "tensorflow/lite/model_builder.h" -#include "tensorflow/lite/schema/schema_generated.h" using mlir::MLIRContext; using mlir::ModuleOp; diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index ac4de7f82b23d0..dd8b345862e3c8 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -50,6 +50,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/metrics/error_collector.h" #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" #include "tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h" @@ -80,7 +81,6 @@ limitations under the License. #include "tensorflow/lite/c/c_api_types.h" #include "tensorflow/lite/experimental/remat/metadata_util.h" #include "tensorflow/lite/python/metrics/converter_error_data.pb.h" -#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/tools/optimize/quantize_weights.h" #include "tensorflow/lite/tools/optimize/reduced_precision_support.h" #include "tsl/platform/protobuf.h" // IWYU pragma: keep diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 463b09005544aa..f77912938d8709 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -45,7 +45,7 @@ namespace tensorflow { // file; otherwise, load from a GraphDef. // Setting prune_unused_nodes to true, would prune unreachable nodes if // output_arrays is specified. -tsl::StatusOr<OwningOpRef<mlir::ModuleOp>> LoadFromGraphdefOrMlirSource( +absl::StatusOr<OwningOpRef<mlir::ModuleOp>> LoadFromGraphdefOrMlirSource( const std::string& input_filename, bool input_mlir, bool use_splatted_constant, const std::vector<std::string>& extra_tf_opdefs, const GraphImportConfig& specs, absl::string_view debug_info_file, @@ -56,7 +56,7 @@ tsl::StatusOr<OwningOpRef<mlir::ModuleOp>> LoadFromGraphdefOrMlirSource( // Load Saved model (either v1 or v2) into MLIR. // 'saved_model_bundle' will be initialized if V1 model was loaded.
-tsl::StatusOr<OwningOpRef<mlir::ModuleOp>> ImportSavedModel( +absl::StatusOr<OwningOpRef<mlir::ModuleOp>> ImportSavedModel( const std::string& input_filename, int saved_model_version, const std::unordered_set<std::string>& tags, absl::Span<const std::string> extra_tf_opdefs, diff --git a/tensorflow/compiler/mlir/lite/transforms/analyze_variables.cc b/tensorflow/compiler/mlir/lite/transforms/analyze_variables.cc index 6fd0278bf909e4..39afd416ab1aa2 100644 --- a/tensorflow/compiler/mlir/lite/transforms/analyze_variables.cc +++ b/tensorflow/compiler/mlir/lite/transforms/analyze_variables.cc @@ -16,6 +16,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -92,7 +93,8 @@ void AnalyzeVariablesPass::runOnOperation() { // Note: this might disable native variables in more than needed cases. // TODO(b/189370197): Enhance variable analysis. for (auto operand : op->getOperands()) { - if (getElementTypeOrSelf(operand.getType()).isa<TF::ResourceType>()) { + if (mlir::isa<TF::ResourceType>( + getElementTypeOrSelf(operand.getType()))) { legalize_to_tfl = false; return WalkResult::interrupt(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc index 5329274271c55c..3fcd82ef033938 100644 --- a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc +++ b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc @@ -31,6 +31,7 @@ limitations under the License. #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -62,20 +63,20 @@ class DequantizeConverter : public OpRewritePattern<SrcOp> { bool allTypesFp = true; bool allTypesQuantizedOrInt = true; for (auto operand : op->getOperands()) { - ShapedType type = operand.getType().template dyn_cast<ShapedType>(); + ShapedType type = mlir::dyn_cast<ShapedType>(operand.getType()); if (!type) continue; - allTypesFp &= !type.getElementType().isa<FloatType>(); + allTypesFp &= !mlir::isa<FloatType>(type.getElementType()); allTypesQuantizedOrInt &= - (type.getElementType().isa<quant::QuantizedType>() || - type.getElementType().isa<IntegerType>()); + (mlir::isa<quant::QuantizedType>(type.getElementType()) || + mlir::isa<IntegerType>(type.getElementType())); } for (auto result : op->getResults()) { - ShapedType type = result.getType().template cast<ShapedType>(); - allTypesFp &= !type.getElementType().isa<FloatType>(); + ShapedType type = mlir::cast<ShapedType>(result.getType()); + allTypesFp &= !mlir::isa<FloatType>(type.getElementType()); allTypesQuantizedOrInt &= - (type.getElementType().isa<quant::QuantizedType>() || - type.getElementType().isa<IntegerType>()); + (mlir::isa<quant::QuantizedType>(type.getElementType()) || + mlir::isa<IntegerType>(type.getElementType())); } // If all quantized or floating point then types are consistent.
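A through-line in the transform-file changes above and below is LLVM's cast API migration: the deprecated member-style casts on mlir::Type and mlir::Attribute (x.cast<T>(), x.dyn_cast<T>(), x.isa<T>()) are replaced by the free functions mlir::cast, mlir::dyn_cast, and mlir::isa, which "mlir/Support/LLVM.h" re-exports into the mlir namespace; that is why this include is added across so many files here. A minimal before/after sketch, assuming only standard MLIR headers (the helper name IsRankedFloatTensor is hypothetical, for illustration only):

    #include "mlir/IR/BuiltinTypes.h"  // mlir::RankedTensorType, mlir::FloatType
    #include "mlir/IR/Value.h"         // mlir::Value
    #include "mlir/Support/LLVM.h"     // re-exports llvm::isa/cast/dyn_cast into mlir::

    bool IsRankedFloatTensor(mlir::Value v) {
      // Before: auto ty = v.getType().dyn_cast<mlir::RankedTensorType>();
      auto ty = mlir::dyn_cast<mlir::RankedTensorType>(v.getType());
      // Before: ty.getElementType().isa<mlir::FloatType>()
      return ty && mlir::isa<mlir::FloatType>(ty.getElementType());
    }

The free-function form works uniformly on types, attributes, and operations, which is what lets these hunks drop the occasional `.template` disambiguation the member form required in dependent contexts.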
diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index 2f015e61d58fe6..94ed4b1e0340a5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -152,7 +153,7 @@ void DefaultQuantParamsPass::AddToWorkListIfUnquantized( Value value, std::vector *values) { // If the result isn't with float type, this result is an integer tensor and // doesn't require quantization. - auto tensor_type = value.getType().dyn_cast(); + auto tensor_type = mlir::dyn_cast(value.getType()); if (!tensor_type) { // There are none type values. return; @@ -202,9 +203,9 @@ quant::QuantParams DefaultQuantParamsPass::GetQuantParamsForBias( for (int non_bias : non_biases) { Operation *non_bias_define = op->getOperand(non_bias).getDefiningOp(); if (auto dequant = llvm::dyn_cast(non_bias_define)) { - auto non_bias_type = dequant.getInput().getType().cast(); + auto non_bias_type = mlir::cast(dequant.getInput().getType()); auto non_bias_ele_type = - non_bias_type.getElementType().cast(); + mlir::cast(non_bias_type.getElementType()); non_bias_types.push_back(non_bias_ele_type); } else { // The non-bias hasn't been quantized, let's skip this bias. diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index 8a3abc94e2af57..5cac14867482bb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/lite/kernels/internal/utils/sparsity_format_converter.h" @@ -92,13 +93,13 @@ float CalculateRandomSparsity(const ElementsAttr& attr, int num_elements = type.getNumElements(); int num_zeros = 0; - if (type.getElementType().isa()) { + if (mlir::isa(type.getElementType())) { for (const auto val : attr.getValues()) { if (val.isZero()) { num_zeros++; } } - } else if (type.getElementType().isa()) { + } else if (mlir::isa(type.getElementType())) { for (const auto val : attr.getValues()) { if (val == 0) { num_zeros++; @@ -144,7 +145,7 @@ float CalculateBlockSparsity(const ElementsAttr& attr, const ShapedType& type, sparsity = GetSparsity(type.getNumElements() - format_converter.GetData().size(), type.getNumElements()); - } else if (type.getElementType().isa()) { + } else if (mlir::isa(type.getElementType())) { tflite::internal::sparsity::FormatConverter format_converter( shape, traversal_order, format, b_size, b_map); std::vector data; @@ -179,10 +180,10 @@ InspectResult InspectWeight( InspectResult result = {}; if (auto cst = dyn_cast(inst)) { attr = cst.getValue(); - type = cst.getType().cast(); + type = mlir::cast(cst.getType()); } else if (auto cst = dyn_cast(inst)) { attr = cst.getValue(); - type = cst.getType().cast(); + type = mlir::cast(cst.getType()); } else { result.can_compress = false; return result; @@ -229,10 +230,10 @@ std::vector BuildSparsityParameterAttribute( ShapedType type; if (auto cst = dyn_cast(inst)) { attr = cst.getValue(); - type = cst.getType().cast(); + type = mlir::cast(cst.getType()); } else if (auto cst = dyn_cast(inst)) { attr = cst.getValue(); - type = cst.getType().cast(); + type = mlir::cast(cst.getType()); } else { assert(false && "Expected a constant-like op"); } @@ -317,10 +318,10 @@ void DenseToSparsePass::runOnOperation() { float ratio_threshold = kBlockOverRandomSparsityRatio; if (isa(inst)) { supported_block_size = sparse_op.GetFloatBlockSize(); - type = dyn_cast(inst).getType().cast(); + type = mlir::cast(dyn_cast(inst).getType()); } else if (isa(inst)) { supported_block_size = sparse_op.GetQuantizedBlockSize(); - type = dyn_cast(inst).getType().cast(); + type = mlir::cast(dyn_cast(inst).getType()); ratio_threshold = kBlockOverRandomSparsityRatioQuant; } else { continue; @@ -341,7 +342,7 @@ void DenseToSparsePass::runOnOperation() { SparsityParameterAttr s_param; if (auto cst = dyn_cast(inst)) { auto attr = cst.getValue(); - auto type = cst.getType().cast(); + auto type = mlir::cast(cst.getType()); if (type.getElementType().isF32()) { std::vector dense_data; dense_data.reserve(type.getNumElements()); @@ -385,7 +386,7 @@ void DenseToSparsePass::runOnOperation() { } } else if (auto cst = dyn_cast(inst)) { auto attr = cst.getValue(); - auto type = cst.getType().cast(); + auto type = mlir::cast(cst.getType()); std::vector dense_data; dense_data.reserve(type.getNumElements()); for (const auto& val : attr.getValues()) diff --git a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h index 51068fcf4ac67c..fe8bb7d2ca177f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h +++ b/tensorflow/compiler/mlir/lite/transforms/dilated_conv.h @@ -29,6 
+29,7 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -110,7 +111,7 @@ LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( } // Allow dynamic width and height dimensions only. - auto result_ty = op.getResult().getType().template cast(); + auto result_ty = mlir::cast(op.getResult().getType()); if (!result_ty.hasRank() || result_ty.getRank() != 4 || result_ty.isDynamicDim(0) || result_ty.isDynamicDim(3)) { return rewriter.notifyMatchFailure( @@ -187,8 +188,7 @@ LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( // Make sure that the axis in `expand_op` is constant. if (auto const_op = llvm::dyn_cast(expand_op.getDim().getDefiningOp())) { - expand_axis = (*const_op.getValue() - .cast() + expand_axis = (*mlir::cast(const_op.getValue()) .getValues() .begin()) .getSExtValue(); @@ -208,7 +208,7 @@ LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( return rewriter.notifyMatchFailure( squeeze_op, "squeeze dims should have exactly 1 dimension specified"); } - int64_t squeeze_axis = squeeze_dims[0].cast().getInt(); + int64_t squeeze_axis = mlir::cast(squeeze_dims[0]).getInt(); if (squeeze_axis < 0) { // Always squeeze 4D input to 3D input. squeeze_axis += 4; @@ -318,7 +318,8 @@ LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( } if (expand_op) { - if (stb_op.getInput().getType().dyn_cast() == nullptr) { + if (mlir::dyn_cast(stb_op.getInput().getType()) == + nullptr) { return rewriter.notifyMatchFailure( stb_op, "SpaceToBatchND op's input should have RankedTensorType"); } @@ -401,7 +402,7 @@ LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( expand_op.setOperand(0, stb_op.getInput()); // Calculate the shape for expand. auto input_shape = - stb_op.getInput().getType().cast().getShape(); + mlir::cast(stb_op.getInput().getType()).getShape(); SmallVector expand_shape(input_shape.begin(), input_shape.end()); expand_shape.insert(expand_shape.begin() + expand_axis, 1); @@ -412,7 +413,7 @@ LogicalResult ConvertTFDilatedConvOp::matchAndRewrite( // Update the conv op's output shape. auto bts_output_shape = - bts_op.getOutput().getType().cast().getShape(); + mlir::cast(bts_op.getOutput().getType()).getShape(); SmallVector conv_result_shape(bts_output_shape.begin(), bts_output_shape.end()); conv_result_shape.insert(conv_result_shape.begin() + expand_axis, 1); diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_hashtables.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_hashtables.cc index 252e18e191aea4..5e88048d775532 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_hashtables.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_hashtables.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -142,10 +143,12 @@ bool checkWhetherGraphHasValidStaticLookupTables(ModuleOp module) { // Only allow string -> int64 and int64 -> string mappings due to kernel // capability. - if (!((key_dtype.isa() && value_dtype.isa() && - value_dtype.cast().getWidth() == 64) || - (value_dtype.isa() && key_dtype.isa() && - key_dtype.cast().getWidth() == 64))) { + if (!((mlir::isa(key_dtype) && + mlir::isa(value_dtype) && + mlir::cast(value_dtype).getWidth() == 64) || + (mlir::isa(value_dtype) && + mlir::isa(key_dtype) && + mlir::cast(key_dtype).getWidth() == 64))) { return false; } diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc index e8bae6eb64280f..9b0a80a4f92a71 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc @@ -84,10 +84,10 @@ void LegalizeJaxRandomPass::runOnOperation() { auto func = getOperation(); if (!IsJaxRandomUniform(func) && !IsJaxRandomNormal(func)) return; auto result_tuple_ty = - func.getFunctionType().getResult(0).dyn_cast_or_null(); + mlir::dyn_cast_or_null(func.getFunctionType().getResult(0)); if (!result_tuple_ty) return; if (result_tuple_ty.size() != 1) return; - auto result_ty = result_tuple_ty.getType(0).dyn_cast(); + auto result_ty = mlir::dyn_cast(result_tuple_ty.getType(0)); func.eraseBody(); func.addEntryBlock(); diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index cfe9bc754d8077..240773a82a9657 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -73,9 +73,6 @@ def CreateTFCastToInt32Op : NativeCodeCall< def CreateInt32ConstOrCast : NativeCodeCall< "CreateInt32ConstOrCast($0, $_loc, $_builder)">; -def CreateNoneValue : NativeCodeCall< - "$_builder.create($0.getLoc(), $_builder.getUnitAttr())">; - // Creates an int32 constant op from an integer attribute $0. def CreateInt32ConstOpFromIntAttr : NativeCodeCall<"$_builder.create($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast($0.cast().getInt())}))">; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc index d4f58c00eea4f5..10adf2434acbca 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.cc @@ -40,12 +40,12 @@ limitations under the License. 
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace { using ::mlir::MLIRContext; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 38a8bffd87bb03..2011b6d33ccd45 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -58,7 +58,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" -#include "xla/status.h" #include "xla/statusor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -96,7 +95,7 @@ class LegalizeTFPass : public impl::LegalizeTFPassBase { // Util that casts 'val' to Int32 by adding a cast Op. Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { IntegerType new_ele_type = rewriter.getIntegerType(32); - if (auto shaped_type = val.getType().dyn_cast()) { + if (auto shaped_type = mlir::dyn_cast(val.getType())) { ShapedType new_type = RankedTensorType::get(shaped_type.getShape(), new_ele_type); return rewriter.createOrFold(loc, new_type, val, @@ -114,7 +113,7 @@ Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { // 2. In the default case, cast the `Value` to an int32_t. Value CreateInt32ConstOrCast(Value val, Location loc, PatternRewriter& rewriter) { - if (val.getType().cast().hasStaticShape()) { + if (mlir::cast(val.getType()).hasStaticShape()) { DenseElementsAttr shape_value_attr; if (matchPattern(val, m_Constant(&shape_value_attr))) { SmallVector new_shape_array_i32; @@ -137,7 +136,7 @@ Value CreateInt32ConstOrCast(Value val, Location loc, // Get shape of an operand or result, support both dynamic and static shape. Value GetShape(Value input, Location loc, PatternRewriter& rewriter) { - auto shaped_type = input.getType().cast(); + auto shaped_type = mlir::cast(input.getType()); if (shaped_type.hasStaticShape()) { auto static_shape = shaped_type.getShape(); auto static_shape_type = @@ -271,7 +270,7 @@ bool ConvertTFBatchMatMulOp2TFLFullyConnectedOp(Operation* bmm_op, // Create a tfl.transpose op that performs ZX transpose on `input`. auto create_z_x_transpose_op = [&](Value input) -> Value { - RankedTensorType input_type = input.getType().cast(); + RankedTensorType input_type = mlir::cast(input.getType()); const int input_rank = input_type.getRank(); // Create a 1D I32 tensor for representing the dimension permutation. 
@@ -364,7 +363,7 @@ LogicalResult ConvertTFMatMulOp::matchAndRewrite( auto rhs = op->getOperand(1); auto transpose = [&](Value input) -> std::pair { RankedTensorType type = - input.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(input.getType()); if (!type || type.getRank() != 2) return {failure(), nullptr}; auto permute_attr = DenseIntElementsAttr::get( @@ -583,15 +582,15 @@ bool ConvertTFMatrixDiagV2orV3(Operation* op, PatternRewriter* rewriter) { // Verify padding_value is a tensor with all 0s. mlir::Value padding_value = tf_matrix_diag_v2_or_v3_op.getPaddingValue(); mlir::Type element_type = - padding_value.getType().cast().getElementType(); - if (element_type.isa()) { + mlir::cast(padding_value.getType()).getElementType(); + if (mlir::isa(element_type)) { DenseFPElementsAttr padding_attr; if (!matchPattern(padding_value, m_Constant(&padding_attr)) || !padding_attr.isSplat() || !padding_attr.getSplatValue().isZero()) { return false; } - } else if (element_type.isa()) { + } else if (mlir::isa(element_type)) { DenseIntElementsAttr padding_attr; if (!matchPattern(padding_value, m_Constant(&padding_attr)) || !padding_attr.isSplat() || @@ -642,7 +641,7 @@ struct LegalizeUnidirectionalSequenceLstm : public RewritePattern { SmallVector tflite_indices; for (auto index_attr : tflite_indices_attr.getValue()) { - IntegerAttr index = index_attr.cast(); + IntegerAttr index = mlir::cast(index_attr); tflite_indices.push_back(index.getInt()); } @@ -773,13 +772,13 @@ class ApplyExplicitBroadcasting : public OpRewritePattern { SmallVector symbolic_broadcast_shape; // Matches fail when lhs or rhs is unranked tensor. // TODO(b/176202543): Support unranked tensor. - if (!lhs.getType().cast().hasRank() || - !rhs.getType().cast().hasRank()) { + if (!mlir::cast(lhs.getType()).hasRank() || + !mlir::cast(rhs.getType()).hasRank()) { return failure(); } if (!OpTrait::util::getBroadcastedShape( - lhs.getType().cast().getShape(), - rhs.getType().cast().getShape(), + mlir::cast(lhs.getType()).getShape(), + mlir::cast(rhs.getType()).getShape(), symbolic_broadcast_shape)) { return failure(); } @@ -824,13 +823,13 @@ class ApplyExplicitBroadcasting : public OpRewritePattern { auto lhs = op->getOperand(0); auto rhs = op->getOperand(1); - if (!lhs.getType().cast().hasStaticShape() || - !rhs.getType().cast().hasStaticShape()) { + if (!mlir::cast(lhs.getType()).hasStaticShape() || + !mlir::cast(rhs.getType()).hasStaticShape()) { return rewriteOpWithDynamicInput(op, rewriter); } - auto lhs_shape = lhs.getType().cast().getShape(); - auto rhs_shape = rhs.getType().cast().getShape(); + auto lhs_shape = mlir::cast(lhs.getType()).getShape(); + auto rhs_shape = mlir::cast(rhs.getType()).getShape(); if (lhs_shape == rhs_shape) { return failure(); @@ -892,23 +891,23 @@ class ApplyExplicitBroadcasting // Matches fail when lhs|rhs|cond is unranked tensor. // TODO(b/176202543): Support unranked tensor. - if (!lhs.getType().cast().hasRank() || - !rhs.getType().cast().hasRank() || - !cond.getType().cast().hasRank()) { + if (!mlir::cast(lhs.getType()).hasRank() || + !mlir::cast(rhs.getType()).hasRank() || + !mlir::cast(cond.getType()).hasRank()) { return failure(); } // Calculates symbolic broadcast shape that is only used in types. 
SmallVector symbolic_broadcast_lhs_rhs_shape; if (!OpTrait::util::getBroadcastedShape( - lhs.getType().cast().getShape(), - rhs.getType().cast().getShape(), + mlir::cast(lhs.getType()).getShape(), + mlir::cast(rhs.getType()).getShape(), symbolic_broadcast_lhs_rhs_shape)) { return failure(); } SmallVector symbolic_broadcast_shape; if (!OpTrait::util::getBroadcastedShape( - cond.getType().cast().getShape(), + mlir::cast(cond.getType()).getShape(), symbolic_broadcast_lhs_rhs_shape, symbolic_broadcast_shape)) { return failure(); } @@ -964,15 +963,15 @@ class ApplyExplicitBroadcasting auto rhs = op->getOperand(2); // Should have static shapes to calculate the broadcasted shape. - if (!lhs.getType().cast().hasStaticShape() || - !rhs.getType().cast().hasStaticShape() || - !cond.getType().cast().hasStaticShape()) { + if (!mlir::cast(lhs.getType()).hasStaticShape() || + !mlir::cast(rhs.getType()).hasStaticShape() || + !mlir::cast(cond.getType()).hasStaticShape()) { return rewriteOpWithDynamicInput(op, rewriter); } - auto lhs_shape = lhs.getType().cast().getShape(); - auto rhs_shape = rhs.getType().cast().getShape(); - auto cond_shape = cond.getType().cast().getShape(); + auto lhs_shape = mlir::cast(lhs.getType()).getShape(); + auto rhs_shape = mlir::cast(rhs.getType()).getShape(); + auto cond_shape = mlir::cast(cond.getType()).getShape(); if (lhs_shape == rhs_shape && cond_shape == lhs_shape) { return failure(); diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_variables.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_variables.cc index 7098b2f75157da..7742ea06976c00 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_variables.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_variables.cc @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -67,7 +68,7 @@ class LegalizeVariablesPass // If TFLite variable legalization is not allowed, then we skip this pass. 
if (auto legalize_tfl_variables_attr = module->getAttr(kLegalizeTflVariables)) { - if (!legalize_tfl_variables_attr.cast().getValue()) return; + if (!mlir::cast(legalize_tfl_variables_attr).getValue()) return; } RewritePatternSet patterns(&getContext()); diff --git a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc index e212ce16ee6ccd..747e96d40b6850 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lift_tflite_flex_ops.cc @@ -117,7 +117,7 @@ class LiftFlexCustomOp : public OpRewritePattern { // TODO(b/146131919): correct handling of resource type if (auto tensor_array_v3_op = dyn_cast(tf_op)) { Value handle = tensor_array_v3_op.getHandle(); - auto handle_type = handle.getType().cast(); + auto handle_type = mlir::cast(handle.getType()); if (handle_type.getElementType().isInteger(/*width=*/32)) { Type resource_tensor_type = handle_type.clone(TF::ResourceType::get(rewriter.getContext())); @@ -225,8 +225,8 @@ class LiftFlexCustomOp : public OpRewritePattern { return emitError(loc, mlir_attr.status().message()); } if (absl::StrContains(op_name, "Dataset") && - mlir_attr->isa()) { - mlir_attr = mlir_attr->cast().getName(); + mlir::isa(*mlir_attr)) { + mlir_attr = mlir::cast(*mlir_attr).getName(); } attributes.push_back(builder.getNamedAttr(attr_name, *mlir_attr)); } diff --git a/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc b/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc index a8adac41229277..7fea1e395ea209 100644 --- a/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc +++ b/tensorflow/compiler/mlir/lite/transforms/modify_io_nodes.cc @@ -94,7 +94,7 @@ LogicalResult ModifyIONodesPass::SetupInputOutputTypesIfNull( LogicalResult ModifyIONodesPass::ModifyInputNodes( func::FuncOp func, llvm::SmallVectorImpl& new_input_types, OpBuilder builder) { - if (input_type.isa()) { + if (mlir::isa(input_type)) { return success(); } @@ -151,7 +151,7 @@ LogicalResult ModifyIONodesPass::ModifyOutputNodes( auto* terminator = block.getTerminator(); builder.setInsertionPoint(terminator); - if (output_type.isa()) { + if (mlir::isa(output_type)) { return success(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index 1fc84007a64cce..606be04a0f7d6b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -81,7 +81,7 @@ constexpr char kRelu6[] = "RELU6"; constexpr char kRelu1[] = "RELU_N1_TO_1"; ElementsAttr FlattenTo1D(Attribute a) { - auto elements = a.cast(); + auto elements = mlir::cast(a); const std::array flattened_shape = {elements.getNumElements()}; auto new_type = RankedTensorType::get(flattened_shape, elements.getType().getElementType()); @@ -91,8 +91,8 @@ ElementsAttr FlattenTo1D(Attribute a) { // This assumes that the bias is of shape NxCx1x1 and doesn't require transpose // Its corresponding constraint is optimize_patterns.td:IsBiasShape() ElementsAttr ReshapeNCHWBiasToNHWC(Value v, Attribute a) { - auto elements = a.cast(); - auto shape = v.getType().cast().getShape(); + auto elements = mlir::cast(a); + auto shape = mlir::cast(v.getType()).getShape(); if (shape.size() != 4 || shape[2] != 1 || shape[3] != 1) return elements; const std::array new_shape = {shape[0], shape[2], shape[3], shape[1]}; @@ -105,15 +105,16 @@ bool L2NormalizeReduceAxis(Value sq_op, 
DenseElementsAttr axis) { if (axis.getNumElements() == 0) { return false; } - if (sq_op.getType().cast().getRank() - 1 == + if (mlir::cast(sq_op.getType()).getRank() - 1 == *axis.getValues().begin() || *axis.getValues().begin() == -1) { return true; } - if (sq_op.getType().cast().getRank() != axis.getNumElements()) { + if (mlir::cast(sq_op.getType()).getRank() != + axis.getNumElements()) { return false; } - auto shape = sq_op.getType().cast(); + auto shape = mlir::cast(sq_op.getType()); SmallVector elems{axis.getValues().begin(), axis.getValues().end()}; for (int i = 0; i < shape.getRank(); ++i) { @@ -144,9 +145,10 @@ class OptimizePass : public impl::OptimizePassBase { // is equal to the non-contracting dimension after a reshape bool BroadcastDimsProductEqual(Value input, Value output, size_t agg_start_idx) { - ArrayRef input_shape = input.getType().cast().getShape(); + ArrayRef input_shape = + mlir::cast(input.getType()).getShape(); ArrayRef output_shape = - output.getType().cast().getShape(); + mlir::cast(output.getType()).getShape(); int64_t agg_value = 1; for (size_t i = agg_start_idx; i < input_shape.size() - 1; ++i) { @@ -166,7 +168,7 @@ bool IsBroadcastableElementsAttrAndType(Type a, Type b) { // broadcast-compatible with `b`. bool OperandsBroadcastToOutputType(Type a, Type b, Type expected_output) { Type output_element_type = - expected_output.cast().getElementType(); + mlir::cast(expected_output).getElementType(); Type broadcasted_type = OpTrait::util::getBroadcastedType(a, b, output_element_type); return broadcasted_type != Type() && broadcasted_type == expected_output; @@ -175,8 +177,8 @@ bool OperandsBroadcastToOutputType(Type a, Type b, Type expected_output) { // Returns whether if `type1` dimensions are the same as the ending dimensions // of `type2`. This is more restricted than broadcastable. bool IsTailOfShape(Type type1, Type type2) { - auto tail_type = type1.dyn_cast(); - auto full_type = type2.dyn_cast(); + auto tail_type = mlir::dyn_cast(type1); + auto full_type = mlir::dyn_cast(type2); if (!tail_type || !full_type || !tail_type.hasRank() || !full_type.hasRank() || tail_type.getRank() > full_type.getRank()) return false; @@ -189,8 +191,8 @@ bool IsTailOfShape(Type type1, Type type2) { // the reduced `type1` dimensions are the same as the ending dimensions // of `type2`. bool IsReducedTailOfShape(Type type1, Type type2) { - auto tail_type = type1.dyn_cast(); - auto full_type = type2.dyn_cast(); + auto tail_type = mlir::dyn_cast(type1); + auto full_type = mlir::dyn_cast(type2); if (!tail_type || !full_type || !tail_type.hasRank() || !full_type.hasRank()) return false; @@ -211,10 +213,10 @@ bool IsReducedTailOfShape(Type type1, Type type2) { // elements in type2. This is a required condition to flatten type2 to form a // 1D array and allow the binaryOp handle the broadcasting implicitly. 
bool IsLastDimEqualToNumElements(Type type1, Type type2) { - return (type1.cast().getRank() >= 1 && - type1.cast().getDimSize( - type1.cast().getRank() - 1) == - type2.cast().getNumElements()); + return (mlir::cast(type1).getRank() >= 1 && + mlir::cast(type1).getDimSize( + mlir::cast(type1).getRank() - 1) == + mlir::cast(type2).getNumElements()); } bool CanFuseConvOrDepthwiseConvShapes(const ArrayRef filter_shape, @@ -249,20 +251,21 @@ bool CanFuseConvOrDepthwiseConvShapes(const ArrayRef filter_shape, bool CanFuseConvOrDepthwiseConv(Value filter, Attribute val, bool is_depthwise) { - const auto elements = val.dyn_cast(); + const auto elements = mlir::dyn_cast(val); if (!elements) { return false; } const auto elements_shape = elements.getType().getShape(); - const auto filter_shape = filter.getType().cast().getShape(); + const auto filter_shape = mlir::cast(filter.getType()).getShape(); return CanFuseConvOrDepthwiseConvShapes(filter_shape, elements_shape, is_depthwise); } bool CanFuseConvOrDepthwiseConv(Attribute filter, Attribute val, bool is_depthwise) { - if (const auto elements = val.dyn_cast()) { - if (const auto filter_elements = filter.dyn_cast()) { + if (const auto elements = mlir::dyn_cast(val)) { + if (const auto filter_elements = + mlir::dyn_cast(filter)) { return CanFuseConvOrDepthwiseConvShapes( filter_elements.getType().getShape(), elements.getType().getShape(), is_depthwise); @@ -277,8 +280,8 @@ bool CanFuseConvOrDepthwiseConv(Attribute filter, Attribute val, bool CanOptimizeIdentityGatherNdOrScatterNdOp(Value params, DenseIntElementsAttr indices, Type output_type) { - auto params_type = params.getType().dyn_cast(); - auto indices_type = indices.getType().dyn_cast(); + auto params_type = mlir::dyn_cast(params.getType()); + auto indices_type = mlir::dyn_cast(indices.getType()); // Checks the shape of `params` is [n, ...], shape of `indices` is [n, 1]. 2D // `indices` means it gets the first row of `params`. As long as indices // iterate the first row of `params`, the output is identical to input. @@ -306,8 +309,8 @@ bool CanOptimizeIdentityGatherNdOrScatterNdOp(Value params, // for each dim i, the output tensor is identical to `input`. bool CanOptimizeIdentitySliceOp(Value input, Attribute begin, Attribute size) { // Checks if `begin` and `size` are i32 or i64. - auto begin_attr = begin.dyn_cast(); - auto size_attr = size.dyn_cast(); + auto begin_attr = mlir::dyn_cast(begin); + auto size_attr = mlir::dyn_cast(size); if (!begin_attr || !size_attr) { return false; } @@ -323,7 +326,7 @@ bool CanOptimizeIdentitySliceOp(Value input, Attribute begin, Attribute size) { // Checks if `input` is ranked and its rank is equal to number of elements in // `begin` and `size`. - auto input_ty = input.getType().cast(); + auto input_ty = mlir::cast(input.getType()); if (!input_ty.hasRank()) { return false; } @@ -348,7 +351,7 @@ bool CanOptimizeIdentitySliceOp(Value input, Attribute begin, Attribute size) { // Expand Attribute 'a' to 4D with all 1s except 1 dimension. // Which dimension depends on 'is_depthwise' is true or false. ElementsAttr ExpandTo4DForConvImpl(Attribute a, bool is_depthwise) { - auto elements = a.dyn_cast(); + auto elements = mlir::dyn_cast(a); auto shape = elements.getType().getShape(); if (!shape.empty()) { // Checks that elements are essentially 1d. 
@@ -377,46 +380,19 @@ TypeAttr RescaleQtype(Type input, Attribute factor) { return quant::RescaleQuantizedType(input, factor); } -// Utility function to map final permutation to initial permutation -// initial -> permutation1 -> permutation2 -> final -DenseElementsAttr RemapPermutation(Value permutation1, Value permutation2) { - SmallVector initial_permutation; - DenseElementsAttr perm1_const; - DenseElementsAttr perm2_const; - - SmallVector new_permutation; - if (matchPattern(permutation1, m_Constant(&perm1_const)) && - matchPattern(permutation2, m_Constant(&perm2_const))) { - for (int32_t idx = 0; idx < perm1_const.getNumElements(); ++idx) { - initial_permutation.push_back(idx); - } - for (auto perm : perm2_const.getValues()) { - new_permutation.push_back( - initial_permutation[perm1_const - .getValues()[perm.getSExtValue()] - .getSExtValue()]); - } - } - - return mlir::DenseElementsAttr::get( - RankedTensorType::get( - {static_cast(new_permutation.size())}, - mlir::IntegerType::get(permutation1.getContext(), 32)), - llvm::ArrayRef(new_permutation)); -} - // Returns `true` if reducing `axes` in `input` with `keep_dims=true` results in // the specified `shape` and `false` otherwise. static bool ShapeMatchesReduceWithKeepAxes(Value input, const mlir::Attribute &axes, const mlir::Attribute &shape) { - RankedTensorType type = input.getType().dyn_cast_or_null(); + RankedTensorType type = + mlir::dyn_cast_or_null(input.getType()); if (!type) return false; DenseIntElementsAttr axes_attr = - axes.dyn_cast_or_null(); + mlir::dyn_cast_or_null(axes); DenseIntElementsAttr shape_attr = - shape.dyn_cast_or_null(); + mlir::dyn_cast_or_null(shape); if (!axes_attr || !shape_attr) return false; if (shape_attr.getNumElements() != type.getRank()) return false; @@ -441,12 +417,12 @@ static bool ShapeMatchesReduceWithKeepAxes(Value input, static bool AreInputDimensionsOneInAxes(Value input, const mlir::Attribute &axes) { RankedTensorType input_type = - input.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(input.getType()); if (!input_type) return false; auto type_shape = input_type.getShape(); DenseIntElementsAttr axes_attr = - axes.dyn_cast_or_null(); + mlir::dyn_cast_or_null(axes); if (!axes_attr) return false; for (auto a : axes_attr.getValues()) { @@ -467,7 +443,7 @@ static bool AreInputDimensionsOneInAxes(Value input, } static bool FloatValueEquals(const Attribute &attr, double value) { - auto fp_attr = attr.dyn_cast_or_null(); + auto fp_attr = mlir::dyn_cast_or_null(attr); if (!fp_attr) return false; if (fp_attr.isSplat()) { @@ -482,12 +458,12 @@ static bool FloatValueEquals(const Attribute &attr, double value) { // to `raw_value`. template bool IsConstantValueOf(mlir::TypedAttr value, T raw_value) { - auto element_type = value.getType().cast().getElementType(); + auto element_type = mlir::cast(value.getType()).getElementType(); - if (element_type.isa()) { + if (mlir::isa(element_type)) { return FloatValueEquals(value, raw_value); - } else if (element_type.isa()) { - auto int_attr = value.dyn_cast_or_null(); + } else if (mlir::isa(element_type)) { + auto int_attr = mlir::dyn_cast_or_null(value); if (!int_attr) return false; if (int_attr.isSplat()) { @@ -502,13 +478,13 @@ bool IsConstantValueOf(mlir::TypedAttr value, T raw_value) { // Returns true if the value's element type is F32. 
bool IsF32Value(Value value) { - return value.getType().cast().getElementType().isF32(); + return mlir::cast(value.getType()).getElementType().isF32(); } // Returns the number of elements in attr if it is a static shape, 1 otherwise, // as an unranked int32 Attribute. TypedAttr GetNumElementsOrOne(Type type) { - auto shaped_type = type.cast(); + auto shaped_type = mlir::cast(type); int32_t num_elements = shaped_type.hasStaticShape() ? shaped_type.getNumElements() : 1; @@ -523,7 +499,7 @@ TypedAttr GetNumElementsOrOne(Type type) { Value ReshapeValueDroppingLastDim(OpBuilder &builder, Value value) { // This function is always guarded with HasTrivialShapeExceptSecondLastDim(), // so we could cast safely here. - auto type = value.getType().cast(); + auto type = mlir::cast(value.getType()); SmallVector new_shape; if (type.hasStaticShape()) { for (int64_t dim : type.getShape().drop_back()) { @@ -543,7 +519,7 @@ Value ReshapeValueDroppingLastDim(OpBuilder &builder, Value value) { // Returns true if val has a static shape and the last dimension equals 1. bool IsLastDimensionEqualOne(Value val) { - const auto val_type = val.getType().cast(); + const auto val_type = mlir::cast(val.getType()); if (!val_type.hasStaticShape()) return false; const auto val_shape = val_type.getShape(); if (val_shape.empty()) return false; @@ -577,7 +553,7 @@ bool HasOneUseOrUsedByOnlyBinaryOps(Value out_value) { // // If such a value is used in an Equal operator, it can be replaced with OneHot. bool IsOneHotIndexAttribute(Attribute attr) { - const auto dense_attr = attr.dyn_cast_or_null(); + const auto dense_attr = mlir::dyn_cast_or_null(attr); if (!dense_attr) { return false; } @@ -602,7 +578,7 @@ bool IsOneHotIndexAttribute(Attribute attr) { } Value Get1DShapeValue(OpBuilder &builder, Value value) { - auto type = value.getType().cast(); + auto type = mlir::cast(value.getType()); if (!type.hasStaticShape()) { return nullptr; } @@ -614,11 +590,11 @@ Value Get1DShapeValue(OpBuilder &builder, Value value) { } Type GetEmbeddingLookupShape(Value lookup, Value value) { - auto lookup_type = lookup.getType().cast(); + auto lookup_type = mlir::cast(lookup.getType()); if (!lookup_type.hasStaticShape()) { return nullptr; } - auto value_type = value.getType().cast(); + auto value_type = mlir::cast(value.getType()); if (!value_type.hasStaticShape() || value_type.getRank() != 2) { return nullptr; } @@ -665,7 +641,7 @@ bool IsF32Splat(Attribute input_splat) { // Attribute holding a single value of float type. If attr has no elements, the // result is 0.0f. TypedAttr ConvertSingleElementAttrToFloatAttr(Attribute attr) { - const auto dense_fp_attr = attr.dyn_cast_or_null(); + const auto dense_fp_attr = mlir::dyn_cast_or_null(attr); if (dense_fp_attr) { // Already float => return return dense_fp_attr; @@ -673,7 +649,7 @@ TypedAttr ConvertSingleElementAttrToFloatAttr(Attribute attr) { OpBuilder builder(attr.getContext()); - const auto dense_int_attr = attr.dyn_cast(); + const auto dense_int_attr = mlir::dyn_cast(attr); const auto int_values = dense_int_attr.getValues(); float float_val = 0.0f; if (!int_values.empty()) { @@ -793,9 +769,7 @@ struct SqueezeReshapesAroundBroadcastOp // Pattern is applied only if the broadcast_to shape has more than 5 // dimensions. 
- if (tfl_broadcast_to_op.getShape() - .getType() - .cast() + if (mlir::cast(tfl_broadcast_to_op.getShape().getType()) .getNumElements() < 6) { return rewriter.notifyMatchFailure(loc, "Not supported broadcast_to shape"); @@ -831,7 +805,7 @@ struct SqueezeReshapesAroundBroadcastOp // Calculate the number of extra leading and trailing 1s in the // broadcast_op output. auto broadcast_output_shapetype = - tfl_broadcast_to_op.getOutput().getType().cast(); + mlir::cast(tfl_broadcast_to_op.getOutput().getType()); int num_leading_broadcast_dims = GetNumLeadingOnes(broadcast_output_shapetype); int num_trailing_broadcast_dims = @@ -839,9 +813,7 @@ struct SqueezeReshapesAroundBroadcastOp // Get the new shape for the inner reshape_op after removing the extra 1s. llvm::SmallVector new_reshape_shape_i32{ - inner_reshape_op.getOutput() - .getType() - .cast() + mlir::cast(inner_reshape_op.getOutput().getType()) .getShape() .drop_back(num_trailing_broadcast_dims) .drop_front(num_leading_broadcast_dims)}; @@ -886,11 +858,11 @@ struct ConvertTFLBroadcastToMulOp LogicalResult matchAndRewrite(TFL::BroadcastToOp tfl_broadcast_to_op, PatternRewriter &rewriter) const override { auto input_type = - tfl_broadcast_to_op.getInput().getType().cast(); + mlir::cast(tfl_broadcast_to_op.getInput().getType()); auto output_type = - tfl_broadcast_to_op.getOutput().getType().cast(); + mlir::cast(tfl_broadcast_to_op.getOutput().getType()); auto shape_type = - tfl_broadcast_to_op.getShape().getType().cast(); + mlir::cast(tfl_broadcast_to_op.getShape().getType()); Type element_type = input_type.getElementType(); auto loc = tfl_broadcast_to_op->getLoc(); @@ -909,7 +881,7 @@ struct ConvertTFLBroadcastToMulOp // Allow lowering when the input's elements type is F32, BFloat16, I32 or // I16. - if (!(element_type.isa() || + if (!(mlir::isa(element_type) || element_type.isInteger(32) || element_type.isInteger(16))) return rewriter.notifyMatchFailure(loc, "element_type_not_supported"); @@ -986,7 +958,7 @@ struct FuseAddAndStridedSlice : public OpRewritePattern { return failure(); mlir::TensorType constant_val_type = - constant_val.getType().cast(); + mlir::cast(constant_val.getType()); // If it's not 1D or 0D (which can be broadcasted to 1D), reject the // matching. if (constant_val_type.getRank() > 1) { @@ -994,14 +966,14 @@ struct FuseAddAndStridedSlice : public OpRewritePattern { } mlir::RankedTensorType end_type = - strided_slice_op.getEnd().getType().dyn_cast(); + mlir::dyn_cast(strided_slice_op.getEnd().getType()); // begin, end and strides are Rank 1 tensors with one element per dimension // of input. int64_t num_dims = end_type.getShape()[0]; DenseElementsAttr new_added_value = added_value.reshape(RankedTensorType::get( {num_dims}, - added_value.getType().cast().getElementType())); + mlir::cast(added_value.getType()).getElementType())); ::mlir::arith::ConstantOp new_end = rewriter.create( strided_slice_op.getEnd().getLoc(), new_added_value); @@ -1183,7 +1155,7 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { add_op.getLhs().getDefiningOp()); if (!fc_op) return failure(); - auto constant_val_type = constant_val.getType().cast(); + auto constant_val_type = mlir::cast(constant_val.getType()); // In TFLite FullyConnect definition, bias must be a 1D tensor where // the number of elements is equal to the number of channels. 
@@ -1199,7 +1171,7 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { Value filter = fc_op.getFilter(); Value bias = fc_op.getBias(); ElementsAttr bias_value; - const bool is_none_bias = bias.getType().isa(); + const bool is_none_bias = mlir::isa(bias.getType()); if (fc_op.getFusedActivationFunction() != "NONE") return failure(); if (!is_none_bias && !matchPattern(bias, m_Constant(&bias_value))) @@ -1212,7 +1184,7 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { // to properly broadcast the scalar to `{num_channels}` shape. // Get the number of channels if possible. - auto filter_type = filter.getType().dyn_cast(); + auto filter_type = mlir::dyn_cast(filter.getType()); // Filter must be a `2D` tensor with `{num_channels, num_features}` // shape. The following check is rejecting unknown rank (-1). if (filter_type == nullptr || filter_type.getRank() != 2) { @@ -1287,14 +1259,14 @@ struct FuseAddAndFullyConnected // Don't match adds where the added constant is not 1D. { - auto addend_shape = add_op.getRhs().getType().cast(); + auto addend_shape = mlir::cast(add_op.getRhs().getType()); if (!addend_shape.hasStaticShape()) return failure(); if (addend_shape.getShape().size() != 1) return failure(); } // Calculate new bias. Generate a new FC; it will be constant folded. auto old_bias = fc_op.getBias(); - if (!old_bias || old_bias.getType().isa()) { + if (!old_bias || mlir::isa(old_bias.getType())) { // TODO(b/180752069): Figure out new bias' type when old bias is empty. return failure(); } @@ -1358,7 +1330,7 @@ struct FuseMulAndFullyConnected // Don't match muls where the multiplier constant is not 1D. { - auto multiplier_shape = mul_op.getRhs().getType().cast(); + auto multiplier_shape = mlir::cast(mul_op.getRhs().getType()); if (!multiplier_shape.hasStaticShape()) return failure(); if (multiplier_shape.getShape().size() != 1) return failure(); } @@ -1464,7 +1436,7 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern { Value bias = fc_op.getBias(); ElementsAttr cst_tmp; if (!matchPattern(filter, m_Constant(&cst_tmp))) return failure(); - if (!bias.getType().isa() && + if (!mlir::isa(bias.getType()) && !matchPattern(bias, m_Constant(&cst_tmp))) return failure(); if (fc_op.getFusedActivationFunction() != "NONE") return failure(); @@ -1494,7 +1466,7 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern { // Rewrite. Since the folder of TFL::MulOp couldn't broadcast the operands, // TF::MulOp is used to fold the constant. // TODO(b/139192933): switch to the TFL constant folding - auto filter_type = filter.getType().cast(); + auto filter_type = mlir::cast(filter.getType()); if (filter_type.hasStaticShape()) { auto size = filter_type.getNumElements() * filter_type.getElementTypeBitWidth(); @@ -1506,7 +1478,7 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern { rewriter.create(mul_op.getLoc(), filter, new_const_val) .getZ(); // If bias isn't None, it needs to be multiplied as well. 
- if (!bias.getType().isa()) { + if (!mlir::isa(bias.getType())) { bias = rewriter.create(mul_op.getLoc(), bias, constant_val) .getZ(); } @@ -1585,7 +1557,7 @@ struct FuseAffinOpAndMulWithQDQs : public OpRewritePattern { // weight constant ElementsAttr cst_tmp; if (!matchPattern(filter, m_Constant(&cst_tmp))) return failure(); - if (!bias.getType().isa() && + if (!mlir::isa(bias.getType()) && !matchPattern(bias, m_Constant(&cst_tmp))) return failure(); if (fc_op.getFusedActivationFunction() != "NONE") return failure(); @@ -1607,7 +1579,7 @@ struct FuseAffinOpAndMulWithQDQs : public OpRewritePattern { } // Make sure that the fused bias will be a 1D tensor. - auto gamma_shape = gamma.getType().cast(); + auto gamma_shape = mlir::cast(gamma.getType()); if (!gamma_shape.hasRank() || gamma_shape.getRank() != 1) { return failure(); } @@ -1623,7 +1595,7 @@ struct FuseAffinOpAndMulWithQDQs : public OpRewritePattern { new_filter, new_qtype); // If bias isn't None, it needs to be multiplied as well. - if (!bias.getType().isa()) { + if (!mlir::isa(bias.getType())) { rewriter.setInsertionPoint(fc_op); auto new_bias = rewriter.create(loc, bias, gamma); fc_op.getOperation()->replaceUsesOfWith(bias, new_bias); @@ -1674,7 +1646,7 @@ struct FuseBinaryOpToFollowingAffineOp : public OpRewritePattern { } filter = q.getInput(); } - if (!bias.getType().isa() && + if (!mlir::isa(bias.getType()) && !matchPattern(bias, m_Constant(&bias_cst))) return failure(); auto binary_op_activation_func = @@ -1705,7 +1677,7 @@ struct FuseBinaryOpToFollowingAffineOp : public OpRewritePattern { // The new bias should be a 1-D tensor with length equals to the bias // dimension of the weight. SmallVector new_bias_values; - if (bias.getType().isa()) { // none bias, a list of zeros + if (mlir::isa(bias.getType())) { // none bias, a list of zeros new_bias_values.resize(bias_size, APFloat::getZero(cst_value.getSemantics())); } else if (bias_cst.getNumElements() == 1) { // scalar bias, broadcast it @@ -1806,12 +1778,11 @@ struct ScalarizeSplatConstantForBroadcastableOps } constexpr int kSplatOperandIndex = 1; - auto result_type = - binary_op.getResult().getType().template cast(); + auto result_type = mlir::cast(binary_op.getResult().getType()); mlir::Value non_splat_operand = binary_op.getOperand(1 - kSplatOperandIndex); auto non_splat_operand_type = - non_splat_operand.getType().cast(); + mlir::cast(non_splat_operand.getType()); // If the other operand's shape does not equal to the result shape, then we // cannot scalarize the splat constant because the result shape relies on // the splat constant op's shape for broadcasting. @@ -1850,10 +1821,11 @@ struct ScalarizeSplatConstantForBroadcastableOps if (!matchPattern(value, m_Constant(elements_attr))) { return false; } - auto element_type = value.getType().cast().getElementType(); + auto element_type = + mlir::cast(value.getType()).getElementType(); // Ignore per-axis quantized constants because after converting to scalar, // we will lose per-axis qantization parameter. - if (element_type.isa()) { + if (mlir::isa(element_type)) { return false; } if (IsScalar(value)) { @@ -1864,7 +1836,7 @@ struct ScalarizeSplatConstantForBroadcastableOps // If this type is a scalar shaped type. bool IsScalar(mlir::Value value) const { - auto type = value.getType().dyn_cast(); + auto type = mlir::dyn_cast(value.getType()); if (!type) { return false; } @@ -1883,7 +1855,7 @@ struct ScalarizeSplatConstantForBroadcastableOps DenseElementsAttr value; // Check that bias are constants if not none. 
Value bias = affine_op->getOperand(2); - if (!bias.getType().isa() && + if (!mlir::isa(bias.getType()) && !matchPattern(bias, m_Constant(&value))) { return false; } @@ -1896,7 +1868,7 @@ struct ScalarizeSplatConstantForBroadcastableOps // We can only fuse F32/BF16. auto is_fusable_type = [](Type t) { Type element_type = t; - if (auto shaped_type = t.dyn_cast()) { + if (auto shaped_type = mlir::dyn_cast(t)) { element_type = shaped_type.getElementType(); } return element_type.isBF16() || element_type.isF32(); @@ -1920,68 +1892,6 @@ using ScalarizeSplatConstantForMul = using ScalarizeSplatConstantForDiv = ScalarizeSplatConstantForBroadcastableOps; -struct ConvertTrivialTransposeOpToReshapeOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(TFL::TransposeOp transpose_op, - PatternRewriter &rewriter) const override { - auto input_type = transpose_op.getInput().getType().cast(); - auto output_type = transpose_op.getOutput().getType().cast(); - // It's possible to know if the transformation is safe only if the input - // & output shapes are fully known and permutation is a constant. - if (!input_type.hasStaticShape() || !output_type.hasStaticShape()) - return failure(); - Value perm = transpose_op.getPerm(); - DenseElementsAttr perm_values_attr; - if (!matchPattern(perm, m_Constant(&perm_values_attr))) return failure(); - - auto input_shape = input_type.getShape(); - SmallVector perm_values; - for (const auto &dim : perm_values_attr.getValues()) - perm_values.push_back(dim.getSExtValue()); - - // This should never happen unless the input graph is malformed. - if (input_shape.size() != perm_values.size()) { - transpose_op.emitError( - "TransposeOP has inconsistent input and perm values."); - } - - SmallVector old_major_index_ordering; - SmallVector new_major_index_ordering; - for (int i = 0, end = input_shape.size(); i < end; i++) { - if (input_shape[i] != 1) { - old_major_index_ordering.push_back(i); - } - - if (input_shape[perm_values[i]] != 1) { - new_major_index_ordering.push_back(perm_values[i]); - } - } - if (old_major_index_ordering != new_major_index_ordering) { - return failure(); - } - - // Rewrite. - Location loc = transpose_op.getLoc(); - - SmallVector output_shape_values; - for (auto dim : output_type.getShape()) { - output_shape_values.push_back( - ShapedType::isDynamic(dim) ? -1 : static_cast(dim)); - } - - auto new_shape = rewriter.create( - loc, GetI32ElementsAttr(output_shape_values, &rewriter)); - - rewriter.replaceOpWithNewOp( - transpose_op, transpose_op.getOutput().getType(), - transpose_op.getInput(), new_shape); - - return success(); - } -}; - // Remove Reshape before FullyConnected when `keep_num_dims=false` and Reshape // does not alter the last dimension as FullyConnected will collapse all other // dimensions into a single dimension. 
For example, @@ -2002,10 +1912,9 @@ struct RemoveReshapeBeforeFullyConnected LogicalResult matchAndRewrite(TFL::FullyConnectedOp fully_connected_op, PatternRewriter &) const override { auto input = fully_connected_op.getInput(); - auto input_ty = input.getType().dyn_cast(); - auto output_ty = fully_connected_op.getOutput()[0] - .getType() - .template dyn_cast(); + auto input_ty = mlir::dyn_cast(input.getType()); + auto output_ty = + mlir::dyn_cast(fully_connected_op.getOutput()[0].getType()); if (!input_ty.hasStaticShape() || fully_connected_op.getWeightsFormat() != "DEFAULT" || fully_connected_op.getKeepNumDims() || !output_ty.hasStaticShape() || @@ -2018,7 +1927,7 @@ struct RemoveReshapeBeforeFullyConnected // Check if the last dimension does not change after reshape. auto reshape_input = reshape_op.getInput(); - auto reshape_input_ty = reshape_input.getType().dyn_cast(); + auto reshape_input_ty = mlir::dyn_cast(reshape_input.getType()); if (!reshape_input_ty.hasStaticShape() || input_ty.getRank() == 0 || reshape_input_ty.getRank() == 0 || input_ty.getDimSize(input_ty.getRank() - 1) != @@ -2061,9 +1970,9 @@ struct RemoveReshapeAfterFullyConnected if (!reshape_op.getInput().hasOneUse()) return failure(); auto input_shape = - fully_connected_op.getInput().getType().cast(); - auto output_shape = fully_connected_op.getType(0).cast(); - auto reshape_shape = reshape_op.getType().cast(); + mlir::cast(fully_connected_op.getInput().getType()); + auto output_shape = mlir::cast(fully_connected_op.getType(0)); + auto reshape_shape = mlir::cast(reshape_op.getType()); if (!input_shape.hasStaticShape() || !output_shape.hasStaticShape() || !reshape_shape.hasStaticShape()) return failure(); @@ -2128,7 +2037,7 @@ struct FuseUnpackAndConcatToReshape } } - auto output_type = concat_op.getType().cast(); + auto output_type = mlir::cast(concat_op.getType()); if (!output_type.hasStaticShape()) { return failure(); } @@ -2188,8 +2097,8 @@ struct OptimizeTopK : public OpRewritePattern { // for last dimension. // It can be done by verifying the number of elements: // i.e., num_input/input_last_dim = num_result/k - auto input_ty = value.getType().dyn_cast_or_null(); - auto result_ty = slice_op.getType().dyn_cast(); + auto input_ty = mlir::dyn_cast_or_null(value.getType()); + auto result_ty = mlir::dyn_cast(slice_op.getType()); if (!input_ty || !result_ty) return std::nullopt; if (!input_ty.hasStaticShape() || !result_ty.hasStaticShape()) return std::nullopt; @@ -2230,8 +2139,8 @@ struct OptimizeTopK : public OpRewritePattern { Value k_cst = rewriter.create( op.getLoc(), DenseElementsAttr::get(k_ty, k)); // Compute new result types. 
- auto values_ty = values.getType().dyn_cast(); - auto indices_ty = indices.getType().dyn_cast(); + auto values_ty = mlir::dyn_cast(values.getType()); + auto indices_ty = mlir::dyn_cast(indices.getType()); auto shape = std::vector(); for (auto d : values_ty.getShape().drop_back()) { shape.push_back(d); @@ -2439,7 +2348,7 @@ struct FuseLogSoftmax : public OpRewritePattern { if (!sum_op || !sum_op.getKeepDims() || !isSupportedAxis( sum_op.getAxes(), - sum_op.getOperand(0).getType().cast().getRank())) { + mlir::cast(sum_op.getOperand(0).getType()).getRank())) { return failure(); } if (!sum_op->hasOneUse()) { @@ -2466,10 +2375,10 @@ struct FuseLogSoftmax : public OpRewritePattern { parent_sub_op.getRhs().getDefiningOp()); if (!reduce_max_op || !reduce_max_op->hasOneUse() || !reduce_max_op.getKeepDims() || - !isSupportedAxis(reduce_max_op.getAxes(), reduce_max_op.getOperand(0) - .getType() - .cast() - .getRank())) { + !isSupportedAxis( + reduce_max_op.getAxes(), + mlir::cast(reduce_max_op.getOperand(0).getType()) + .getRank())) { return failure(); } @@ -2562,7 +2471,7 @@ struct UndoBroadcastFullyConnectedBiasAddWithQDQs } auto bias_type = bias_op.getType(); - auto bias_rank = bias_type.cast().getRank(); + auto bias_rank = mlir::cast(bias_type).getRank(); if (bias_rank > 4 || bias_rank < 2) { return failure(); } @@ -2587,8 +2496,8 @@ struct UndoBroadcastFullyConnectedBiasAddWithQDQs q_op.setOperand(new_bias_op); auto new_q_op_type = RankedTensorType::Builder( - q_op.getResult().getType().cast()) - .setShape(new_bias_type.cast().getShape()); + mlir::cast(q_op.getResult().getType())) + .setShape(mlir::cast(new_bias_type).getShape()); q_op.getResult().setType(new_q_op_type); auto attr = TypeAttr::get(q_op.getResult().getType()); q_op.setQtypeAttr(attr); @@ -2596,8 +2505,8 @@ struct UndoBroadcastFullyConnectedBiasAddWithQDQs // Update DequantizeOp's output shape auto new_dq_op_type = RankedTensorType::Builder( - dq_op.getResult().getType().cast()) - .setShape(new_bias_type.cast().getShape()); + mlir::cast(dq_op.getResult().getType())) + .setShape(mlir::cast(new_bias_type).getShape()); dq_op.getResult().setType(new_dq_op_type); // Remove old bias @@ -2655,9 +2564,9 @@ void OptimizePass::runOnOperation() { FuseFullyConnectedAndReluX, FuseBinaryOpToFollowingConv2D, FuseBinaryOpToFollowingDepthwiseConv2D, FuseBinaryOpToFollowingFullyConnected, FuseConv2DAndMulWithQDQs, - FuseDepthwiseConv2DAndMulWithQDQs, ConvertTrivialTransposeOpToReshapeOp, - RemoveReshapeAfterFullyConnected, RemoveReshapeBeforeFullyConnected, - FuseUnpackAndConcatToReshape, OptimizeTopK, FuseAddAndStridedSlice, + FuseDepthwiseConv2DAndMulWithQDQs, RemoveReshapeAfterFullyConnected, + RemoveReshapeBeforeFullyConnected, FuseUnpackAndConcatToReshape, + OptimizeTopK, FuseAddAndStridedSlice, FuseReshapeAndTransposeAroundBatchMatmul, FuseTransposeReshapeIntoBatchMatmul>(ctx); if (!this->disable_fuse_mul_and_fc_) { diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.cc index 5b696b52db4b2e..0eacfcb8ef09f0 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul.cc @@ -94,7 +94,8 @@ struct ConvertBatchMatMulOp2FullyConnectedOp // Create a tfl.transpose op that performs ZX transpose on `input`. 
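For orientation on the lambda below: the "ZX transpose" it constructs swaps only the two innermost dimensions and leaves every batch dimension in place, which is what lets a transposed BatchMatMul operand feed a FullyConnected op. A condensed sketch of the permutation it materializes (the helper is ours, assuming rank >= 2):

    #include <numeric>   // std::iota
    #include <utility>   // std::swap
    #include "llvm/ADT/SmallVector.h"

    // Identity permutation with the last two entries swapped,
    // e.g. rank 4 -> [0, 1, 3, 2].
    static llvm::SmallVector<int32_t> MakeZXPermutation(int input_rank) {
      llvm::SmallVector<int32_t> perm(input_rank);
      std::iota(perm.begin(), perm.end(), 0);
      std::swap(perm[input_rank - 1], perm[input_rank - 2]);
      return perm;
    }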
auto create_z_x_transpose_op = [&](Value input) -> Value { - RankedTensorType input_type = input.getType().cast(); + RankedTensorType input_type = + mlir::cast(input.getType()); const int input_rank = input_type.getRank(); // Create a 1D I32 tensor for representing the dimension permutation. @@ -176,7 +177,7 @@ struct ConvertBatchMatMulOpToReduceSum // the adj(X|Y) attribute, respectively. // So adjX == True indicates [..., c_x, r_x == 1]. llvm::ArrayRef lhs_shape = - bmm_op.getX().getType().cast().getShape(); + mlir::cast(bmm_op.getX().getType()).getShape(); int rX = lhs_shape.size() - 2; int cX = lhs_shape.size() - 1; if (bmm_op.getAdjX()) { @@ -189,7 +190,7 @@ struct ConvertBatchMatMulOpToReduceSum } llvm::ArrayRef rhs_shape = - bmm_op.getY().getType().cast().getShape(); + mlir::cast(bmm_op.getY().getType()).getShape(); int rY = rhs_shape.size() - 1; int cY = rhs_shape.size() - 2; if (bmm_op.getAdjX()) { @@ -210,11 +211,11 @@ struct ConvertBatchMatMulOpToReduceSum private: bool SplatValueEquals(SplatElementsAttr float_or_int, double rhs) const { - if (float_or_int.isa()) { - return float_or_int.cast() + if (mlir::isa(float_or_int)) { + return mlir::cast(float_or_int) .getSplatValue() .isExactlyValue(rhs); - } else if (float_or_int.cast()) { + } else if (mlir::cast(float_or_int)) { return float_or_int.getSplatValue() == static_cast(rhs); } return false; diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc index 7d7ab4b5acd33d..69137210b48ffc 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc @@ -21,12 +21,13 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -110,7 +111,7 @@ class FoldIfOp : public OpRewritePattern { if (!matchPattern(op.getCond(), m_Constant(&cond))) return failure(); // TODO(hinsu): Handle constants that are not scalar booleans. - auto cond_type = cond.getType().dyn_cast(); + auto cond_type = mlir::dyn_cast(cond.getType()); if (!cond_type || !cond_type.getShape().equals({}) || !cond_type.getElementType().isInteger(/*width=*/1)) return failure(); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc index 4ce0a3b8c43225..62c2c43778e254 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_op_order.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -66,9 +67,9 @@ struct PushDownDequantize : public OpRewritePattern { // Only push down the dequantize op when the output is smaller, so that it // can have smaller memory usage. auto input_type = - dequantize_op.getOutput().getType().dyn_cast(); - auto output_type = - passthrough_op->getResult(0).getType().dyn_cast(); + mlir::dyn_cast(dequantize_op.getOutput().getType()); + auto output_type = mlir::dyn_cast( + passthrough_op->getResult(0).getType()); if (!input_type || !output_type || get_num_elements(input_type) <= get_num_elements(output_type)) { return failure(); @@ -85,7 +86,7 @@ struct PushDownDequantize : public OpRewritePattern { // Set the input type of the passthrough op and pull it up. Type new_output_type; - if (input_element_type.isa()) { + if (mlir::isa(input_element_type)) { new_output_type = QuantizedType::getQuantizedElementType( dequantize_op.getInput().getType()) .castFromExpressedType(output_type); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 0b068972c8fd30..4353b82e2fb901 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -366,6 +366,15 @@ class ConstDoubleValueLessThan : Constraint< "std::abs(*$0.cast().getValues().begin()) < " # n>>; +// Constraint that the attribute value is negative infinity or negative largest. +// We use both -inf & flt_min due to the forward compatibility. +def ConstAPFloatNegLargestOrNegInfinity : Constraint() && " + "$0.cast().getNumElements() == 1 && " + "(($0.cast().getValues()[0].isLargest() && " + "$0.cast().getValues()[0].isNegative()) || " + "$0.cast().getValues()[0].isNegInfinity())">>; + def L2NormValidReduceIndex : Constraint())">>; @@ -771,9 +780,13 @@ def UndoBroadcastConvBiasAdd : Pat< (HasRankAtLeast<2> $bias), (IsDefinedByConv2DOp $lhs)]>; -// Function to map final permutation to initial permutation -// initial -> permutation1 -> permutation2 -> final -def RemapPermutation: NativeCodeCall<"RemapPermutation($0, $1)">; +// Pattern to convert a trivial transpose op to a reshape op. +def ConvertTrivialTransposeOpToReshapeOp : Pat< + (TFL_TransposeOp:$transpose_op $input, (Arith_ConstantOp:$permutation $p1)), + (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShape $transpose_op))), + [(IsTransposeTrivial $input, $permutation), + (AnyStaticShapeTensor $input), + (AnyStaticShapeTensor $transpose_op)]>; // Pattern to fuse redundant tanspose op def FoldDoubleTranspose : Pat< @@ -1013,6 +1026,30 @@ def FoldNormalizationIntoSoftmax : Pat< (HasOneUse $sub), (HasOneUse $max)]>; +// Convert softmax(x-reshape(maximum(max(x), -inf))) into softmax(x) as the softmax op already deals +// with the max normalization. 
This comes from upstream Jax (https://github.com/google/jax/pull/15677)
+def FoldNormalizationIntoSoftmaxJaxWithAxisMinus1 : Pat<
+  (TFL_SoftmaxOp
+    (TFL_SubOp:$sub $input,
+      (TFL_ReshapeOp:$reshape
+        (TFL_MaximumOp:$maximum
+          (TFL_ReduceMaxOp:$max $max_input, (Arith_ConstantOp I32ElementsAttr: $axes),
+            ConstBoolAttrFalse),
+          (Arith_ConstantOp F32ElementsAttr: $threshold)
+        ),
+        (Arith_ConstantOp I32ElementsAttr: $shape)
+      ),
+      TFL_AF_None),
+    $beta),
+  (TFL_SoftmaxOp $input, $beta),
+  [(IsSame $input, $max_input),
+   (AxesIsLastDimension $axes, $max_input),
+   (ConstAPFloatNegLargestOrNegInfinity $threshold),
+   (HasOneUse $maximum),
+   (HasOneUse $reshape),
+   (HasOneUse $sub),
+   (HasOneUse $max)]>;
+
 def HaveSameType : Constraint>;

 class AllElementsAreF32 : Constraint;

+// Fuse a redundant RHS TFL_TransposeOp into TFL_BatchMatMulOp if the RHS is a
+// constant tensor of rank 2.
+def FuseTransposeIntoBatchMatMulRHS: Pat<
+  (TFL_BatchMatMulOp $lhs,
+    (TFL_TransposeOp (TFL_QConstOp:$input $_, $_), (Arith_ConstantOp:$perm_value $p0)),
+    $adj_x, $adj_y, $asymmetric_quantize_inputs),
+  (TFL_FullyConnectedOp
+    $lhs,
+    $input, (CreateNoneValue $lhs), TFL_AF_None, TFL_FCWO_Default,
+    ConstBoolAttrTrue, $asymmetric_quantize_inputs),
+  [(HasRank<2> $input),
+   (AreLastTwoDimsTransposed $perm_value),
+   (IsBoolAttrEqual<"false"> $adj_y)]>;
+
 // Replace conv-->transpose-->add with conv-->add-->transpose
 // The bias needs only reshape (i.e. ReshapeNCHWBiasToNHWC) and not transpose
 // because the bias's shape simply changes from NxCx1x1 to Nx1x1xC.
diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.td b/tensorflow/compiler/mlir/lite/transforms/passes.td
index eefb109d2b966e..b2ab947b3895b3 100644
--- a/tensorflow/compiler/mlir/lite/transforms/passes.td
+++ b/tensorflow/compiler/mlir/lite/transforms/passes.td
@@ -397,6 +397,9 @@ def QuantizePass : Pass<"tfl-quantize", "mlir::func::FuncOp"> {
            "std::string", "Names of location to blocklist from quantization">,
       Option<"enable_custom_op_weight_only_", "enable-custom-op-weight-only",
            "std::string", "", "Specifies which custom ops are weight-only.">,
+      Option<"enable_float16_quantization_",
+           "enable-float16-quantization", "bool",
+           "false", "Whether to apply float16 quantization. If false, int8 quantization is applied.">,
   ];
 }
diff --git a/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc b/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc
index 1d0cd497b052f3..7baa0136f1c33c 100644
--- a/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc
@@ -37,9 +37,9 @@ namespace {
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h.inc"

 bool IsResourceTensor(Value value) {
-  const auto tensor_type = value.getType().dyn_cast();
+  const auto tensor_type = mlir::dyn_cast(value.getType());
   return tensor_type &&
-         tensor_type.getElementType().isa();
+         mlir::isa(tensor_type.getElementType());
 }

 // The default criterion for operations being considered as causing or being
diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc
index 80d7ab24c23316..867eecff15818f 100644
--- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -233,8 +234,8 @@ struct FoldTransposeOp : public OpRewritePattern { DenseIntElementsAttr perm_tensor; if (!matchPattern(op.getPerm(), m_Constant(&perm_tensor))) return failure(); - if (!(getElementTypeOrSelf(op.getOutput().getType())) - .isa()) + if (!mlir::isa( + (getElementTypeOrSelf(op.getOutput().getType())))) return failure(); ElementsAttr input_tensor = qconst_op.getValue(); @@ -244,7 +245,7 @@ struct FoldTransposeOp : public OpRewritePattern { assert(perm_tensor.getType().getNumElements() == num_dimensions); ArrayRef input_shape = input_tensor.getShapedType().getShape(); - auto output_type = op.getOutput().getType().cast(); + auto output_type = mlir::cast(op.getOutput().getType()); SmallVector perm; SmallVector output_shape; @@ -265,9 +266,9 @@ struct FoldTransposeOp : public OpRewritePattern { auto result_type = RankedTensorType::get(output_shape, output_type.getElementType()); auto values_type = RankedTensorType::get( - output_shape, output_type.getElementType() - .cast() - .getStorageType()); + output_shape, + mlir::cast(output_type.getElementType()) + .getStorageType()); rewriter.replaceOpWithNewOp( op, TypeAttr::get(result_type), DenseIntElementsAttr::get(values_type, new_values)); @@ -289,18 +290,18 @@ struct FoldReshapeOp : public OpRewritePattern { if (qconst_op == nullptr) return failure(); auto dense_elements = - qconst_op.getValue().dyn_cast_or_null(); + mlir::dyn_cast_or_null(qconst_op.getValue()); if (dense_elements == nullptr) return failure(); // Handle per tensor cases only. - if (!(getElementTypeOrSelf(op.getType())) - .isa()) { + if (!mlir::isa( + (getElementTypeOrSelf(op.getType())))) { return failure(); } // Remove identity reshape with both static result and input shape. 
- auto result_type = op.getType().cast(); - auto input_type = op.getInput().getType().cast(); + auto result_type = mlir::cast(op.getType()); + auto input_type = mlir::cast(op.getInput().getType()); // Constant folding // If the result type isn't static, tries to derive the result type from @@ -318,9 +319,9 @@ struct FoldReshapeOp : public OpRewritePattern { RankedTensorType::get(shape_data, input_type.getElementType()); } auto values_type = RankedTensorType::get( - result_type.getShape(), result_type.getElementType() - .cast() - .getStorageType()); + result_type.getShape(), + mlir::cast(result_type.getElementType()) + .getStorageType()); DenseElementsAttr reshaped_elements = dense_elements.reshape(values_type); rewriter.replaceOpWithNewOp(op, TypeAttr::get(result_type), diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 98582d03c553b9..9ed32a1b9a674e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -80,12 +80,16 @@ LogicalResult CreateTflFusableOpCustomOptions( size_t start_map = fbb.StartMap(); for (auto attr : attrs) { - if (auto float_attr = attr.second.dyn_cast_or_null()) { + if (auto float_attr = mlir::dyn_cast_or_null(attr.second)) { fbb.Float(attr.first.data(), float_attr.getValue().convertToFloat()); - } else if (auto int_attr = attr.second.dyn_cast_or_null()) { + } else if (auto int_attr = + mlir::dyn_cast_or_null(attr.second)) { fbb.Int(attr.first.data(), int_attr.getInt()); - } else if (auto bool_attr = attr.second.dyn_cast_or_null()) { + } else if (auto bool_attr = mlir::dyn_cast_or_null(attr.second)) { fbb.Bool(attr.first.data(), bool_attr.getValue()); + } else if (auto string_attr = + mlir::dyn_cast_or_null(attr.second)) { + fbb.String(attr.first.data(), string_attr.getValue().str()); } else { // TODO(b/201482289): support other data types. return failure(); @@ -180,7 +184,7 @@ LogicalResult CheckFusableLayerNormalizedLstmCellSimple( func::FuncOp lstm_func) { for (int i = 0; i < 5; ++i) { auto input = lstm_func.getArgument(i); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type) { lstm_func.emitWarning( "we cannot fuse this lstm func because all the inputs have not " @@ -195,7 +199,7 @@ LogicalResult CheckFusableLayerNormalizedLstmCellSimple( LogicalResult CheckFusableLstmCellSimple(func::FuncOp lstm_func) { for (int i = 0; i < 4; ++i) { auto input = lstm_func.getArgument(i); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type) { lstm_func.emitWarning( "we cannot fuse this lstm func because all the inputs have not " @@ -248,7 +252,7 @@ LogicalResult CheckFusableKerasLstm(func::FuncOp lstm_func, ModuleOp module) { // types. for (int i = 0; i < 6; ++i) { auto input = lstm_func.getArgument(i); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type) { lstm_func.emitWarning( "we cannot fuse this lstm func because all the inputs have not " @@ -366,7 +370,7 @@ void PrepareCompositeFunctionsPass::ConvertTFImplementsWithAttributes( for (auto attr_item : dict_attr) { // Push other attributes except the TFLFusableOp. 
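The new StringAttr branch in CreateTflFusableOpCustomOptions above writes into the same flexbuffer map as the existing float/int/bool branches. A standalone sketch of that serialization (the key and value are illustrative, not from the patch):

    #include <cstdint>
    #include <vector>
    #include "flatbuffers/flexbuffers.h"

    // Build a custom-options payload holding one string attribute.
    std::vector<uint8_t> BuildCustomOptions() {
      flexbuffers::Builder fbb;
      size_t start_map = fbb.StartMap();
      fbb.String("padding", "SAME");  // hypothetical attribute
      fbb.EndMap(start_map);
      fbb.Finish();
      return fbb.GetBuffer();
    }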
     if (attr_item.getName() == kTFLFusableOp &&
-        attr_item.getValue().dyn_cast().getValue()) {
+        mlir::dyn_cast(attr_item.getValue()).getValue()) {
       tfl_fusable_op = true;
     } else {
       attributes.push_back({attr_item.getName(), attr_item.getValue()});
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td
index c625b329be6413..78951ae16397f6 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td
@@ -15,6 +15,7 @@ limitations under the License.

 include "tensorflow/compiler/mlir/tensorflow/transforms/optimize.td"
 include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td"
+include "tensorflow/compiler/mlir/lite/utils/utils.td"

 def FalseBoolAttr : AttrConstraint>;

@@ -67,6 +68,28 @@ def ConvertMatmulWithTranspose : Pat<(TF_MatMulOp $a, $b, ConstBoolAttrTrue, $bt
           /*delta=*/(TF_ConstOp TFi32<-1>)), (TF_ConstOp TFi32<1>))),
     $b, ConstBoolAttrFalse, $bt, $grad_a, $grad_b)>;

+// Pattern to fuse redundant transpose op
+def FoldDoubleTranspose : Pat<
+  (TF_TransposeOp
+    (TF_TransposeOp:$transpose_out1 $input, (Arith_ConstantOp:$permutation1 $p1)),
+    (Arith_ConstantOp:$permutation2 $p2)),
+  (TF_TransposeOp $input,
+    (Arith_ConstantOp (RemapPermutation $permutation1, $permutation2))),
+  [(HasOneUse $transpose_out1)]>;
+
+// Pattern to fuse trivial reshape op into transpose op
+def FoldTrivialReshapeIntoTranspose : Pat<
+  (TF_ReshapeOp:$output
+    (TF_TransposeOp:$transpose_out1 $input, (Arith_ConstantOp:$permutation1 $p1)), $_),
+  (TF_TransposeOp:$transpose_op $input,
+    (Arith_ConstantOp
+      (RemapPermutation $permutation1,
+        (GetPermutationFromTrivialReshape $transpose_out1, $output)))),
+  [(IsReshapeEquivalentToTranspose $transpose_out1, $output),
+   (AnyStaticShapeTensor $input),
+   (AnyStaticShapeTensor $output),
+   (HasOneUse $transpose_out1)]>;
+
 // Partially supported in TFLite, treated as passthrough IdentityOp
 def ConvertCheckNumerics : Pat<(TF_CheckNumericsOp $arg, $msg), (TF_IdentityOp $arg)>;
 def ConvertSnapshot : Pat<(TF_SnapshotOp $arg), (TF_IdentityOp $arg)>;
@@ -136,6 +159,19 @@ def ReorderReshapeDequantQuantUsedByDepthwiseConv :
       (CanUpdateShapeWithAxis<3> $qtype, $old_value)],
   [], (addBenefit 10)>;

+// The axis is set to 3, because this transpose is from the legalization of
+// tf.DepthwiseConv2dNative and the new channel axis is the last dimension.
+def ReorderTransposeDequantQuantUsedByDepthwiseConv :
+  Pat<(TF_TransposeOp:$old_value
+        (TFL_DequantizeOp (TFL_QuantizeOp $input, $qtype)), $perm),
+      (TFL_DequantizeOp
+        (TFL_QuantizeOp
+          (TF_TransposeOp $input, $perm),
+          (UpdateShapeWithAxis<3> $qtype, $old_value))),
+      [(UsedBy<"DepthwiseConv2D"> $old_value),
+       (CanUpdateShapeWithAxis<3> $qtype, $old_value)],
+  [], (addBenefit 10)>;
+
 // The Rank op produces a result that is independent of the quantization
 // parameters of the input, so we can remove the quantization ops.
def OptimizeAwayRankDequantQuant : diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index ce11ca73970136..9f76ad1f6e9098 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -153,8 +153,8 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { bool need_to_set_input_nodes_quantization_params = false; for (const BlockArgument arg : func.getArguments()) { - auto shaped = arg.getType().dyn_cast(); - if (shaped && shaped.getElementType().isa() && + auto shaped = mlir::dyn_cast(arg.getType()); + if (shaped && mlir::isa(shaped.getElementType()) && !has_quantize_op(arg)) { need_to_set_input_nodes_quantization_params = true; break; @@ -179,8 +179,8 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { auto add_quantize_op = [&](Location loc, Type input_type, Block* block, Block::iterator insertion_point, Value arg, int i) { - if (auto shaped = input_type.dyn_cast()) { - if (shaped.getElementType().isa()) { + if (auto shaped = mlir::dyn_cast(input_type)) { + if (mlir::isa(shaped.getElementType())) { // If there are existing quantize ops, they are from training and we // should respect them. if (has_quantize_op(arg)) { diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc index f0fd79ff207f39..0b823844aa4a58 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h" @@ -193,7 +194,7 @@ class PrepareDynamicRangeQuantizableOp continue; } - if (attr.dyn_cast().size() >= + if (mlir::dyn_cast(attr).size() >= quant_specs_.minimum_elements_for_weights) { continue; } @@ -205,7 +206,7 @@ class PrepareDynamicRangeQuantizableOp "supported. 
The operand ") << const_op->getName().getStringRef().str() << " at index " << qi << " was not quantized because it has " - << attr.dyn_cast().size() + << mlir::dyn_cast(attr).size() << " elements which is fewer than the " "`minimum_elements_for_weights` threshold of " << quant_specs_.minimum_elements_for_weights; @@ -233,7 +234,7 @@ class PrepareDynamicRangeQuantizableOp // Get types TensorType old_result_type = - op.getResult().getType().template dyn_cast(); + mlir::dyn_cast(op.getResult().getType()); FloatType quantized_type = FloatType::getF16(op.getContext()); ShapedType new_result_type = old_result_type.clone(quantized_type); @@ -287,27 +288,27 @@ class PrepareDynamicRangeQuantizableOp DenseFPElementsAttr attr; if (!matchPattern(op->getResult(0), m_Constant(&attr))) return false; - if (attr.dyn_cast().size() < + if (mlir::dyn_cast(attr).size() < quant_specs_.minimum_elements_for_weights) { op->emitRemark("Quantization is skipped for ") << quantize_op->getName().getStringRef().str() << " because it has " - << attr.dyn_cast().size() + << mlir::dyn_cast(attr).size() << " elements which is fewer than the threshold(" << quant_specs_.minimum_elements_for_weights << " elements)."; return false; } if (op_with_per_axis_support) { - quant_type = quant::GetUniformQuantizedPerAxisTypeForWeight( - attr, affine_user.GetQuantizationDimIndex(), - /*symmetric=*/true, bit_width, is_signed, - is_narrow_range, is_legacy_float) - .template dyn_cast(); + quant_type = mlir::dyn_cast( + quant::GetUniformQuantizedPerAxisTypeForWeight( + attr, affine_user.GetQuantizationDimIndex(), + /*symmetric=*/true, bit_width, is_signed, is_narrow_range, + is_legacy_float)); } else { - quant_type = quant::GetUniformQuantizedTypeForWeight( - attr, is_narrow_range && is_signed, bit_width, is_signed, - is_narrow_range, is_legacy_float) - .template dyn_cast(); + quant_type = mlir::dyn_cast( + quant::GetUniformQuantizedTypeForWeight( + attr, is_narrow_range && is_signed, bit_width, is_signed, + is_narrow_range, is_legacy_float)); } return insertQDQ(rewriter, op, quant_type, quant_op); } @@ -346,7 +347,7 @@ class PrepareDynamicRangeQuantizableOp bool getQuantizableOps(arith::ConstantOp op, QuantizationUnits& quantizable_ops) const { // Non-float tensors do not need quantization. - auto type = op.getType().dyn_cast(); + auto type = mlir::dyn_cast(op.getType()); if (!type || !type.getElementType().isF32()) return false; Value value = op.getResult(); @@ -420,7 +421,7 @@ class PrepareDynamicRangeQuantizableOp // Get types Type old_result_type = op.getResult().getType(); ShapedType new_result_type = - cast_op.getType().template dyn_cast(); + mlir::dyn_cast(cast_op.getType()); // Proceeds only if the casting is to float16 if (!new_result_type.getElementType().isF16()) continue; @@ -428,7 +429,7 @@ class PrepareDynamicRangeQuantizableOp // Cast values std::vector new_values; DenseFPElementsAttr value_attr = - op.getValue().cast(); + mlir::cast(op.getValue()); new_values.reserve(value_attr.getNumElements()); constexpr float kMaxFloat16Value = 65504.f; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h index e102c6bedd4328..061a8db4398321 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h @@ -36,16 +36,17 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h" #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/tools/optimize/operator_property.h" //===----------------------------------------------------------------------===// @@ -100,19 +101,18 @@ LogicalResult GetLstmProperty(LstmOp op, return failure(); } lstm_variant->use_projection = - !op.getProjectionWeights().getType().template isa(); + !mlir::isa(op.getProjectionWeights().getType()); lstm_variant->use_peephole = - !op.getCellToOutputWeights().getType().template isa(); + !mlir::isa(op.getCellToOutputWeights().getType()); lstm_variant->use_layer_norm = - !op.getForgetLayerNormCoefficients().getType().template isa(); + !mlir::isa(op.getForgetLayerNormCoefficients().getType()); *op_property = operator_property::GetOperatorProperty( *lstm_variant, activation_number_of_bits); // TODO(b/176258587) move this to operator_property.cc if this is needed in // other components, too. - bool use_cifg = - op.getInputToInputWeights().getType().template isa(); + bool use_cifg = mlir::isa(op.getInputToInputWeights().getType()); if (use_cifg) { const absl::flat_hash_set cifg_non_inputs = {1, 5, 9, 12, 20}; const int cifg_non_intermediate = 0; @@ -197,9 +197,9 @@ class PrepareLstmOutputScale : public OpRewritePattern { llvm::SmallVector min_max_values; for (auto& stats_op : stats_ops) { - auto values = stats_op.getLayerStats() - .dyn_cast() - .getValues(); + auto values = + mlir::dyn_cast(stats_op.getLayerStats()) + .getValues(); min_max_values.insert(min_max_values.end(), values.begin(), values.end()); } @@ -285,8 +285,8 @@ class ConvertOpStatsToQDQs : public OpRewritePattern { const operator_property::TensorProperty& tensor_property, PatternRewriter& rewriter) const { // Non-float tensors are neither weights nor require quantization. 
- auto type = const_op->getResult(0).getType().dyn_cast(); - if (!type || !type.getElementType().isa()) return success(); + auto type = mlir::dyn_cast(const_op->getResult(0).getType()); + if (!type || !mlir::isa(type.getElementType())) return success(); DenseFPElementsAttr attr; if (!matchPattern(const_op->getResult(0), m_Constant(&attr))) { @@ -312,12 +312,12 @@ class ConvertOpStatsToQDQs : public OpRewritePattern { rewriter.getIntegerType(16), attr.getType().getElementType(), scale, /*zeroPoint=*/0, llvm::minIntN(10), -llvm::minIntN(10)); } else { - quant_type = quant::GetUniformQuantizedTypeForWeight( - attr, /*symmetric=*/true, - /*num_bits=*/tensor_property.number_of_bits, - /*is_signed=*/true, - /*narrow_range=*/true, quant_specs_.legacy_float_scale) - .template dyn_cast(); + quant_type = mlir::dyn_cast( + quant::GetUniformQuantizedTypeForWeight( + attr, /*symmetric=*/true, + /*num_bits=*/tensor_property.number_of_bits, + /*is_signed=*/true, + /*narrow_range=*/true, quant_specs_.legacy_float_scale)); } if (!quant_type) { const_op->emitError("Failed to get quantized type"); @@ -346,7 +346,7 @@ class ConvertOpStatsToQDQs : public OpRewritePattern { << "] is a state tensor, but has more than one use."; return failure(); } - auto stats = stats_op.getLayerStats().dyn_cast(); + auto stats = mlir::dyn_cast(stats_op.getLayerStats()); if (!stats || stats.getNumElements() != 2) { stats_op.emitError("Stats should have 2 values."); return failure(); @@ -454,7 +454,7 @@ class ConvertLstmStatsToQDQs : public ConvertOpStatsToQDQs { return failure(); } auto calibrated_type = - quant_type.template dyn_cast(); + mlir::dyn_cast(quant_type); if (!calibrated_type) { int num_storage_bits = quant_type.getStorageTypeIntegralWidth(); if (tensor_property.number_of_bits != num_storage_bits) { @@ -474,9 +474,9 @@ class ConvertLstmStatsToQDQs : public ConvertOpStatsToQDQs { /*narrowRange=*/false, calibrated_type.getExpressedType(), /*isSigned=*/this->quant_specs_.IsSignedInferenceType()); if (this->quant_specs_.legacy_float_scale) { - qtype = quant::DownCastScale(qtype, calibrated_type.getMin(), - calibrated_type.getMax(), op.getLoc()) - .template cast(); + qtype = mlir::cast( + quant::DownCastScale(qtype, calibrated_type.getMin(), + calibrated_type.getMax(), op.getLoc())); } } else if (tensor_property.number_of_bits == 16) { double max = std::max(std::abs(calibrated_type.getMin()), @@ -508,9 +508,9 @@ inline quant::AccumulatorScaleFunc GetUniformQuantizedTypeForBiasWithScale( return [=](const std::vector& quant_params, const int adjusted_quant_dim, const bool legacy_float_scale) -> quant::QuantParams { - if (auto qtype = quant::GetUniformQuantizedTypeForBias( - quant_params, legacy_float_scale, adjusted_quant_dim) - .dyn_cast_or_null()) { + if (auto qtype = mlir::dyn_cast_or_null( + quant::GetUniformQuantizedTypeForBias( + quant_params, legacy_float_scale, adjusted_quant_dim))) { return quant::UniformQuantizedType::get( qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), qtype.getScale() * scale, qtype.getZeroPoint(), @@ -540,14 +540,14 @@ std::unique_ptr GetLstmOpQuantSpec(LstmOp op) { tensor_property.derived_scale.intermediate_tensors) { auto quant_type = GetIntermediateElementType(op, tensor_index); if (!quant_type || - !quant_type.template isa()) { + !mlir::isa(quant_type)) { op->emitError() << "While processing derived scale, intermediate " << intermediate_attributes[tensor_index] << " is not quantized."; return nullptr; } - scale *= quant_type.template dyn_cast() - .getScale(); + scale *= 
+ mlir::dyn_cast(quant_type).getScale(); } for (float factor : tensor_property.derived_scale.factors) { scale *= factor; @@ -590,7 +590,8 @@ class PropagateTransposedPerAxisQuantDim auto q_op = dyn_cast_or_null( dq_op.getOperand().getDefiningOp()); if (!q_op) return failure(); - auto qtype = dq_op.getArg().getType().cast().getElementType(); + auto qtype = + mlir::cast(dq_op.getArg().getType()).getElementType(); auto aqtype = dyn_cast_or_null(qtype); if (!aqtype) return failure(); @@ -599,8 +600,8 @@ class PropagateTransposedPerAxisQuantDim auto next_op = *transpose_op.getResult().getUsers().begin(); if (dyn_cast_or_null(next_op)) return failure(); - auto input_type = transpose_op.getInput().getType().cast(); - auto perm_type = transpose_op.getPerm().getType().cast(); + auto input_type = mlir::cast(transpose_op.getInput().getType()); + auto perm_type = mlir::cast(transpose_op.getPerm().getType()); if (input_type.hasStaticShape() && perm_type.hasStaticShape()) { if (perm_type.getNumElements() != input_type.getRank()) { return transpose_op.emitOpError( diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 9f0a7fbafff450..b0b6fc8ac7f2d8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -67,6 +67,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" #include "tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h" #include "tensorflow/compiler/mlir/lite/utils/size_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" @@ -89,7 +90,7 @@ namespace { // Preconditions: The given value must have a ShapedType. static Value CreateTFCastOpI32(OpBuilder *builder, Location loc, Value x, BoolAttr truncate) { - auto x_type = x.getType().dyn_cast_or_null(); + auto x_type = mlir::dyn_cast_or_null(x.getType()); if (!x_type) llvm_unreachable("unsupported type"); Type type = x_type.clone(builder->getI32Type()); return builder->create(loc, type, x, truncate); @@ -200,14 +201,14 @@ class ConvertTFConvOp : public RewritePattern { // that we can extract info from the shape (e.g., for constructing bias // tensor, for setting depth_multiplier attribute, etc.). auto filter = tf_op.getFilter(); - auto filter_type = filter.getType().template dyn_cast(); + auto filter_type = mlir::dyn_cast(filter.getType()); if (!filter_type || filter_type.getRank() != 4 || !filter_type.hasStaticShape()) return failure(); Value input = tf_op.getInput(); RankedTensorType input_type = - input.getType().template dyn_cast(); + mlir::dyn_cast(input.getType()); // Only rank size four input will be only available by the tf.Conv2D // operator verification. if (!input_type || input_type.isDynamicDim(3)) { @@ -244,7 +245,7 @@ class ConvertTFConvOp : public RewritePattern { op->getAttrOfType("explicit_paddings").getValue(); auto get_int = [](Attribute attr) { - return attr.template cast().getInt(); + return mlir::cast(attr).getInt(); }; SmallVector padding_values(padding_attr_array.size()); @@ -324,7 +325,7 @@ class ConvertTFConv2D : public ConvertTFConvOp { auto perm_op = rewriter.create(loc, perm_type, perm_attr); // Create tensor type for the transpose result. 
- auto filter_type = filter.getType().cast(); + auto filter_type = mlir::cast(filter.getType()); auto result_shape = llvm::to_vector<4>(llvm::map_range(perm, [filter_type](int64_t dim) { return filter_type.getDimSize(dim); @@ -361,7 +362,8 @@ class ConvertTFDepthwiseConv2dNative // have a corresponding 'depth_multiplier' attribute; the multiplier is the // fourth dimension in the 4-D filter tensor. We query the multiplier from // tf.DepthwiseConv2dNative and set it as the attribute value accordingly. - auto multiplier = filter.getType().cast().getDimSize(3); + auto multiplier = + mlir::cast(filter.getType()).getDimSize(3); filter = legalizeFilter(rewriter, loc, filter); return rewriter.create( @@ -385,7 +387,7 @@ class ConvertTFDepthwiseConv2dNative /// RankedTensorType. Value legalizeFilter(PatternRewriter &rewriter, Location loc, Value filter) const { - auto filter_type = filter.getType().cast(); + auto filter_type = mlir::cast(filter.getType()); auto filterShape = filter_type.getShape(); SmallVector result_shape = {1, filterShape[0], filterShape[1], filterShape[2] * filterShape[3]}; @@ -443,7 +445,7 @@ struct ConvertTFStridedSlice : public RewritePattern { // Insert a new reshape op. Value original_input = strided_slice_op.getInput(); RankedTensorType original_input_type = - original_input.getType().dyn_cast(); + mlir::dyn_cast(original_input.getType()); if (!original_input_type) { return failure(); } @@ -522,7 +524,8 @@ struct ConvertTFStridedSlice : public RewritePattern { DenseIntElementsAttr begin_dense_elem_attr; Value begin = strided_slice_op.getBegin(); - auto begin_ranked_attr_type = begin.getType().dyn_cast(); + auto begin_ranked_attr_type = + mlir::dyn_cast(begin.getType()); if (!begin_ranked_attr_type || !matchPattern(begin, m_Constant(&begin_dense_elem_attr))) { return failure(); @@ -530,7 +533,7 @@ struct ConvertTFStridedSlice : public RewritePattern { DenseIntElementsAttr end_dense_elem_attr; Value end = strided_slice_op.getEnd(); - auto end_ranked_attr_type = end.getType().dyn_cast(); + auto end_ranked_attr_type = mlir::dyn_cast(end.getType()); if (!end_ranked_attr_type || !matchPattern(end, m_Constant(&end_dense_elem_attr))) { return failure(); @@ -539,14 +542,15 @@ struct ConvertTFStridedSlice : public RewritePattern { DenseIntElementsAttr stride_dense_elem_attr; Value stride = strided_slice_op.getStrides(); auto stride_ranked_attr_type = - stride.getType().dyn_cast(); + mlir::dyn_cast(stride.getType()); if (!stride_ranked_attr_type || !matchPattern(stride, m_Constant(&stride_dense_elem_attr))) { return failure(); } Value input = strided_slice_op.getInput(); - RankedTensorType input_type = input.getType().dyn_cast(); + RankedTensorType input_type = + mlir::dyn_cast(input.getType()); if (!input_type) { return failure(); } @@ -554,7 +558,7 @@ struct ConvertTFStridedSlice : public RewritePattern { const int input_size = input_shape.size(); - RankedTensorType begin_type = begin.getType().cast(); + RankedTensorType begin_type = mlir::cast(begin.getType()); const ArrayRef begin_shape = begin_type.getShape(); const int begin_dim = begin_shape.size(); @@ -688,7 +692,7 @@ struct ConvertTFStridedSlice : public RewritePattern { } auto ranked_input_type = - strided_slice_op.getInput().getType().dyn_cast(); + mlir::dyn_cast(strided_slice_op.getInput().getType()); if (!ranked_input_type) { return failure(); } @@ -697,10 +701,11 @@ struct ConvertTFStridedSlice : public RewritePattern { auto end_attr = strided_slice_op.getEnd(); auto strides_attr = strided_slice_op.getStrides(); - auto 
begin_attr_type = begin_attr.getType().dyn_cast(); - auto end_attr_type = end_attr.getType().dyn_cast(); + auto begin_attr_type = + mlir::dyn_cast(begin_attr.getType()); + auto end_attr_type = mlir::dyn_cast(end_attr.getType()); auto strides_attr_type = - strides_attr.getType().dyn_cast(); + mlir::dyn_cast(strides_attr.getType()); DenseIntElementsAttr begin_elem_attr; DenseIntElementsAttr end_elem_attr; @@ -899,8 +904,8 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { if (!epsilon) epsilon = rewriter.getFloatAttr(rewriter.getF32Type(), 0.0001f); - if (!(((epsilon.isa<::mlir::FloatAttr>())) && - ((epsilon.cast<::mlir::FloatAttr>().getType().isF32())))) { + if (!(((mlir::isa<::mlir::FloatAttr>(epsilon))) && + ((mlir::cast<::mlir::FloatAttr>(epsilon).getType().isF32())))) { return rewriter.notifyMatchFailure( fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { diag << "op 'tf.FusedBatchNormV3' attribute 'epsilon' failed to " @@ -963,7 +968,7 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { int64_t last_dim = ShapedType::kDynamic; { auto is_last_dim_compatible = [](const Value &v, int64_t &last_dim) { - auto v_type = v.getType().dyn_cast_or_null(); + auto v_type = mlir::dyn_cast_or_null(v.getType()); if (!v_type) return true; int64_t v_last_dim = v_type.getDimSize(v_type.getRank() - 1); if (v_last_dim == ShapedType::kDynamic) return true; @@ -1007,9 +1012,8 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { // For training, mean and variance is calculated from input values. if (is_training.getValue()) { - auto input_type = fused_batch_norm_op.getX() - .getType() - .dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null( + fused_batch_norm_op.getX().getType()); if (!input_type || input_type.getRank() != 4) { return rewriter.notifyMatchFailure( fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { @@ -1383,14 +1387,14 @@ struct ConvertRfftToRfft2d : public RewritePattern { auto rfft_op = dyn_cast(op); auto input = rfft_op.getInput(); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type) return failure(); auto fft_len = rfft_op.getFftLength(); - auto fft_len_type = fft_len.getType().dyn_cast_or_null(); + auto fft_len_type = mlir::dyn_cast_or_null(fft_len.getType()); if (!fft_len_type) return failure(); auto output_type = - rfft_op.getResult().getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(rfft_op.getResult().getType()); if (!output_type) return failure(); // Expanded inputs. diff --git a/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise.cc b/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise.cc index 363c30ab0b818c..7a8b35e4be7cde 100644 --- a/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise.cc +++ b/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project @@ -280,7 +281,7 @@ class CommuteTransposeWithEwiseOps : public RewritePattern { } auto other_input_type = - cst_arg->getResult(0).getType().cast(); + mlir::cast(cst_arg->getResult(0).getType()); Operation *tposed_const; if (other_input_type.getNumElements() == 1) { diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc index 9c38d09ab0c2bd..e41f98af795347 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc @@ -249,6 +249,9 @@ void QuantizePass::runOnOperation() { quant::CustomOpUpdateOptions::kWeightOnly, quant_specs.custom_map); } + if (enable_float16_quantization_) { + quant_specs.inference_type = tensorflow::DT_HALF; + } const quant::QuantPassSpec quant_params = { {quant_specs.verify_numeric, error_tolerance_, diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc index 0d9db051ef27ff..96412f20633f6a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc @@ -33,6 +33,7 @@ limitations under the License. #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -169,7 +170,7 @@ void QuantizeVariablesPass::QuantizeVariable( for (VarHandleOp var_handle_op : var_handle_ops) { builder.setInsertionPoint(var_handle_op); auto output_type = UnrankedTensorType::get(TF::ResourceType::get( - {ref_qtype.cast()}, builder.getContext())); + {mlir::cast(ref_qtype)}, builder.getContext())); auto new_var_handle_op = builder.create( var_handle_op.getLoc(), output_type, var_handle_op.getContainer(), var_handle_op.getSharedName()); diff --git a/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc b/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc index bee14272020446..659c5aceb39c04 100644 --- a/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc +++ b/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc @@ -62,12 +62,12 @@ class CheckRangeAndConvertI8ToI4 : public OpRewritePattern { LogicalResult matchAndRewrite(arith::ConstantOp op, PatternRewriter &rewriter) const override { - auto const_type = op.getType().dyn_cast(); + auto const_type = mlir::dyn_cast(op.getType()); if (!const_type || !const_type.getElementType().isSignlessInteger(8)) { return failure(); } - auto attr = op.getValue().cast(); + auto attr = mlir::cast(op.getValue()); for (mlir::APInt v : attr.getValues()) { auto v_int = static_cast(*(v.getRawData())); if (v_int > 7 || v_int < -8) { @@ -79,7 +79,7 @@ class CheckRangeAndConvertI8ToI4 : public OpRewritePattern { auto shaped_type = mlir::RankedTensorType::get(const_type.getShape(), 
                                                 builder.getI4Type());
     auto newAttr = DenseElementsAttr::getFromRawBuffer(
-        shaped_type, op.getValue().cast().getRawData());
+        shaped_type, mlir::cast(op.getValue()).getRawData());
     rewriter.replaceOpWithNewOp(op, newAttr);

     return success();
@@ -92,8 +92,8 @@ class SanitizeGatherOpOutputToI4 : public OpRewritePattern {
   LogicalResult matchAndRewrite(TFL::GatherOp op,
                                 PatternRewriter &rewriter) const override {
-    auto const_type = op.getOperand(0).getType().dyn_cast();
-    auto result_type = op.getResult().getType().dyn_cast();
+    auto const_type = mlir::dyn_cast(op.getOperand(0).getType());
+    auto result_type = mlir::dyn_cast(op.getResult().getType());
     if (!const_type || !const_type.getElementType().isSignlessInteger(4) ||
         !result_type || !result_type.getElementType().isSignlessInteger(8)) {
       return failure();
@@ -109,7 +109,8 @@ class SanitizeGatherOpOutputToI4 : public OpRewritePattern {
     auto new_gather_op = rewriter.create(
         op.getLoc(),
         /*result=*/
-        op.getResult().getType().cast().clone(builder.getI4Type()),
+        mlir::cast(op.getResult().getType())
+            .clone(builder.getI4Type()),
         /*operand=*/op.getOperands(), op->getAttrs());

     rewriter.replaceAllUsesWith(op.getResult(), new_gather_op.getResult());
diff --git a/tensorflow/compiler/mlir/lite/transforms/reduce_while_operands.cc b/tensorflow/compiler/mlir/lite/transforms/reduce_while_operands.cc
index c8999216c8054b..ab03af3a4c062a 100644
--- a/tensorflow/compiler/mlir/lite/transforms/reduce_while_operands.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/reduce_while_operands.cc
@@ -104,7 +104,7 @@ void FindProducers(Value start_node, std::vector &neighbors) {
   while (!queue.empty()) {
     auto node = queue.back();
     queue.pop_back();
-    if (auto arg = node.dyn_cast_or_null()) {
+    if (auto arg = mlir::dyn_cast_or_null(node)) {
       neighbors.push_back(arg.getArgNumber());
       continue;
     }
@@ -149,7 +149,7 @@ bool AllOperationSafe(Block &block) {
     // Fact: if every op's operands are defined in the same block as op,
     // then no operation has implicit arguments (constant doesn't count).
     for (auto operand : op->getOperands()) {
-      if (operand.dyn_cast_or_null()) continue;
+      if (mlir::dyn_cast_or_null(operand)) continue;
       auto operand_op = operand.getDefiningOp();
       if (IsConstant(operand_op)) continue;
       if (operand_op->getBlock() != op->getBlock()) {
diff --git a/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc b/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc
index a787da584ea8be..4c555a8d0f6e3b 100644
--- a/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/split_merged_operands.cc
@@ -85,23 +85,36 @@ LogicalResult DuplicateValueIfNeeded(Operation* op,
     Value operand = op->getOperand(index);
     auto inserted_value = values->insert(operand).second;
     if (inserted_value) continue;
-    // We can only clone the constant op at this point.
-    // Since all ops have been legalized to tflite ops, so we only care about
-    // ConstOp or QConstOp or mlir constant op/
+    // We can only clone the constant op or const->dequantize combo. The latter
+    // case is useful for float16 quantization. Since all ops have been
+    // legalized to tflite ops, we only care about ConstOp, QConstOp, or the
+    // mlir constant op.
     Operation* input_op = operand.getDefiningOp();
     if (input_op == nullptr) return failure();

     Attribute attr;
-    if (!matchPattern(input_op, m_Constant(&attr))) {
+    if (matchPattern(input_op, m_Constant(&attr))) {
+      // Constant case.
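    // (Schematic IR, our illustration rather than part of the patch: the new
    // const->dequantize branch below turns
    //   %w  = "tfl.pseudo_const"() : () -> tensor<...xf16>
    //   %dq = "tfl.dequantize"(%w) : (tensor<...xf16>) -> tensor<...xf32>
    // feeding two stateful operands into two independent %w/%dq chains, so
    // each stateful operand gets its own buffer under float16 quantization.)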
+ builder->setInsertionPoint(op); + Operation* duplicated_input_op = builder->clone(*input_op); + + // Rewire the inputs. + op->setOperand(index, duplicated_input_op->getResult(0)); + } else if (auto dq = dyn_cast(input_op); + dq && matchPattern(dq.getInput(), m_Constant(&attr))) { + // Constant -> Dequantize case. + builder->setInsertionPoint(op); + Operation* duplicated_input_op = + builder->clone(*dq.getInput().getDefiningOp()); + Operation* duplicated_dq_op = builder->clone(*dq); + // Rewire the inputs. + duplicated_dq_op->setOperand(0, duplicated_input_op->getResult(0)); + op->setOperand(index, duplicated_dq_op->getResult(0)); + } else { op->emitError() << "We cannot duplicate the value since it's not constant.\n"; return failure(); } - builder->setInsertionPoint(op); - Operation* duplicated_input_op = builder->clone(*input_op); - - // Rewire the inputs. - op->setOperand(index, duplicated_input_op->getResult(0)); } return success(); } diff --git a/tensorflow/compiler/mlir/lite/transforms/unfold_large_splat_constant.cc b/tensorflow/compiler/mlir/lite/transforms/unfold_large_splat_constant.cc index 1def97523cd668..2669159b0206bb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/unfold_large_splat_constant.cc +++ b/tensorflow/compiler/mlir/lite/transforms/unfold_large_splat_constant.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -62,7 +63,7 @@ class UnfoldLargeSplatConstantPass void MaybeUnfoldLargeSplatConstant(mlir::OpBuilder* op_builder, mlir::arith::ConstantOp const_op) const { auto splat_elements_attr = - const_op.getValue().dyn_cast(); + mlir::dyn_cast(const_op.getValue()); if (!splat_elements_attr) { return; } diff --git a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc index a3c3ece3dc94a1..013abb6ec0ea80 100644 --- a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc +++ b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -92,13 +93,13 @@ bool IsCompatibleTypeWithTFLCastOp(Type type) { return true; // Complex> is allowed. - if (elemType.isa() && - elemType.cast().getElementType().isF32()) + if (mlir::isa(elemType) && + mlir::cast(elemType).getElementType().isF32()) return true; // QUINT8 and UI8 are allowed. 
- if (elemType.isa() || - (elemType.isInteger(8) && elemType.cast().isUnsigned())) + if (mlir::isa(elemType) || + (elemType.isInteger(8) && mlir::cast(elemType).isUnsigned())) return true; return false; diff --git a/tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h b/tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h index ac170f33d9ba85..c851d73b03290d 100644 --- a/tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h +++ b/tensorflow/compiler/mlir/lite/utils/arithmetic_count_util.h @@ -17,6 +17,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { namespace TFL { @@ -27,7 +28,7 @@ class ArithmeticCountUtilHelper { static bool GetFirstOutputCount(mlir::Operation* op, int64_t* count) { auto output = op->getResult(0); auto output_type = - output.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(output.getType()); if (!output_type || !output_type.hasStaticShape()) return false; *count = output_type.getNumElements(); @@ -38,7 +39,7 @@ class ArithmeticCountUtilHelper { int64_t total_count = 0; for (auto input : op->getOperands()) { auto input_type = - input.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(input.getType()); if (!input_type || !input_type.hasStaticShape()) { return false; } @@ -54,12 +55,12 @@ class ArithmeticCountUtilHelper { int64_t* count) { auto weight = op->getOperand(1); auto weight_type = - weight.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(weight.getType()); if (weight_type == nullptr || !weight_type.hasStaticShape()) return false; auto output = op->getResult(0); auto output_type = - output.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(output.getType()); if (output_type == nullptr || !output_type.hasStaticShape()) return false; int64_t cols = 1; @@ -73,7 +74,7 @@ class ArithmeticCountUtilHelper { auto bias = op->getOperand(2); if (bias) { auto bias_type = - bias.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(bias.getType()); if (bias_type && bias_type.hasStaticShape()) { *count += output_type.getNumElements(); } diff --git a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc index 20336080cc20d6..1629000ff181df 100644 --- a/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/attribute_utils.cc @@ -15,23 +15,24 @@ limitations under the License. 
#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { namespace TFL { FloatAttr ExtractSingleElementAsFloat(ElementsAttr attr) { if (attr.getShapedType().getNumElements() != 1 || - !attr.getShapedType().getElementType().isa()) { + !mlir::isa(attr.getShapedType().getElementType())) { return {}; } return attr.getSplatValue(); } FloatAttr GetSingleElementAsFloatOrSelf(Attribute attr) { - if (auto m = attr.dyn_cast_or_null()) { + if (auto m = mlir::dyn_cast_or_null(attr)) { return ExtractSingleElementAsFloat(m); } else { - return attr.dyn_cast_or_null(); + return mlir::dyn_cast_or_null(attr); } } diff --git a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc index 96d75cca30a48d..41eed865496a01 100644 --- a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc @@ -36,11 +36,11 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/low_bit_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" -#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_util.h" #include "tsl/platform/statusor.h" @@ -131,11 +131,11 @@ StatusOr GetQuantizedType(const TensorT& tensor, Builder builder, if (!storage_type) { const mlir::Type raw_elem_type = ConvertElementType(tensor.type, builder); - if (!raw_elem_type.isa()) { + if (!mlir::isa(raw_elem_type)) { return absl::InvalidArgumentError( "Quantized tensors must be stored as integers"); } - storage_type = raw_elem_type.cast(); + storage_type = mlir::cast(raw_elem_type); } // TFlite uses narrow-range [u]int8 for constant buffers of quantized weights. 
@@ -254,11 +254,11 @@ mlir::ElementsAttr GetSplat(RankedTensorType type, int unique_index, return DenseElementsAttr::get( type, builder.getIntegerAttr(element_ty, unique_index)); - if (element_ty.isa()) + if (mlir::isa(element_ty)) return DenseElementsAttr::get( type, builder.getFloatAttr(element_ty, unique_index)); - if (auto qtype = element_ty.dyn_cast()) { + if (auto qtype = mlir::dyn_cast(element_ty)) { mlir::RankedTensorType new_type = tensorflow::GetTypeFromTFTensorShape( type.getShape(), qtype.getStorageType()); return DenseElementsAttr::get( @@ -272,9 +272,10 @@ StatusOr ConvertIntBuffer( bool truncate) { mlir::Type elem_type = shaped_type.getElementType(); unsigned bit_width; - if (auto itype = elem_type.dyn_cast()) { + if (auto itype = mlir::dyn_cast(elem_type)) { bit_width = itype.getWidth(); - } else if (auto qtype = elem_type.dyn_cast()) { + } else if (auto qtype = + mlir::dyn_cast(elem_type)) { bit_width = qtype.getStorageTypeIntegralWidth(); shaped_type = tensorflow::GetTypeFromTFTensorShape(shaped_type.getShape(), qtype.getStorageType()); diff --git a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h index 52bcbbef72aba6..d9618517a5dc96 100644 --- a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.h @@ -25,9 +25,9 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace TFL { diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc index 8bf3b4f0106604..6a4dbf3e505ba6 100644 --- a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc @@ -56,7 +56,8 @@ absl::StatusOr CreateTypedAttr(ShapedType shaped_type, int value) { } else if (element_type.isF32()) { return DenseElementsAttr::get(shaped_type, static_cast(value)); - } else if (auto complex_type = element_type.dyn_cast()) { + } else if (auto complex_type = + mlir::dyn_cast(element_type)) { auto etype = complex_type.getElementType(); if (etype.isF32()) { tensorflow::TensorProto repr; @@ -77,7 +78,7 @@ absl::StatusOr CreateTypedAttr(ShapedType shaped_type, int value) { return tensorflow::Status(absl::StatusCode::kInvalidArgument, "Unsupported type"); } - } else if (auto itype = element_type.dyn_cast()) { + } else if (auto itype = mlir::dyn_cast(element_type)) { if (element_type.isSignedInteger()) { switch (itype.getWidth()) { case 8: diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc index f2e659b9aea9ce..c7f922de39ad81 100644 --- a/tensorflow/compiler/mlir/lite/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc @@ -18,12 +18,13 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "xla/statusor.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { @@ -40,16 +41,16 @@ tflite::TensorType ConvertTypeToTensorType(mlir::Type type) { return tflite::TensorType_FLOAT32; } else if (type.isF64()) { return tflite::TensorType_FLOAT64; - } else if (type.isa()) { + } else if (mlir::isa(type)) { return tflite::TensorType_STRING; - } else if (auto complex_type = type.dyn_cast()) { + } else if (auto complex_type = mlir::dyn_cast(type)) { if (complex_type.getElementType().isF32()) { return tflite::TensorType_COMPLEX64; } else if (complex_type.getElementType().isF64()) { return tflite::TensorType_COMPLEX128; } llvm_unreachable("invalid complex Type in conversion"); - } else if (auto itype = type.dyn_cast()) { + } else if (auto itype = mlir::dyn_cast(type)) { switch (itype.getWidth()) { case 1: return tflite::TensorType_BOOL; @@ -209,7 +210,7 @@ absl::StatusOr TfTypeToTflType(tensorflow::DataType type) { mlir::Type GetShapeStrippedType(mlir::TypeAttr type_attr) { auto type = type_attr.getValue(); - auto shaped_type = type.dyn_cast(); + auto shaped_type = mlir::dyn_cast(type); if (shaped_type) { return shaped_type.getElementType(); } else { diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.h b/tensorflow/compiler/mlir/lite/utils/convert_type.h index ce26591d52b34a..85631dbe258f8e 100644 --- a/tensorflow/compiler/mlir/lite/utils/convert_type.h +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.h @@ -19,9 +19,9 @@ limitations under the License. #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/schema/schema_generated.h" #include "xla/statusor.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { class Builder; diff --git a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h index 77b047f68c6bf2..d1dcf8c304b0a9 100644 --- a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h @@ -123,7 +123,7 @@ class InsertTFLQuantOpsAfterTFFakeQuantOp { int quant_dim = -1; if (PerAxis) { // This is a special case that the quant_dim is the last dimensions. - quant_dim = res.getType().template cast().getRank() - 1; + quant_dim = mlir::cast(res.getType()).getRank() - 1; } // Use the min/max from the operands and the num_bits and narrow_range // attribute to create the quantization parameter for the new quantize op. 
diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc index 0a563238635d20..bada49a68a9e55 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc @@ -127,7 +127,7 @@ Value Reverse(OpBuilder* builder, Value value_to_reverse, int axis, } ArrayRef GetRankedTensorShape(Value value) { - return value.getType().cast().getShape(); + return mlir::cast(value.getType()).getShape(); } Value SliceRankedTensor(OpBuilder* builder, Value input, @@ -159,7 +159,7 @@ Value SliceRankedTensor(OpBuilder* builder, Value input, location, RankedTensorType::get( size_values, - input.getType().cast().getElementType()), + mlir::cast(input.getType()).getElementType()), input, slice_i2c_begin, slice_i2c_size); } @@ -170,7 +170,8 @@ Value CreateStridedSliceOp(mlir::Location loc, ArrayRef output_shape, int64_t ellipsis_mask, int64_t new_axis_mask, int64_t shrink_axis_mask, OpBuilder* builder) { auto output_type = RankedTensorType::get( - output_shape, input.getType().cast().getElementType()); + output_shape, + mlir::cast(input.getType()).getElementType()); auto begin_tensor = CreateI32DenseConst(builder, begin, loc); auto end_tensor = CreateI32DenseConst(builder, end, loc); auto strides_tensor = CreateI32DenseConst(builder, strides, loc); @@ -387,7 +388,8 @@ void ConvertLSTMCellSimpleToFusedLSTM::UpdateFuncSignature() { SmallVector output_shape{1, tensorflow::kTFDynamicSize}; auto input_types = fused_func_op_.getFunctionType().getInputs(); auto output_type = tensorflow::GetTypeFromTFTensorShape( - output_shape, input_.getType().cast().getElementType()); + output_shape, + mlir::cast(input_.getType()).getElementType()); fused_func_op_.setType(mlir::FunctionType::get(fused_func_op_.getContext(), input_types, output_type)); } @@ -410,7 +412,8 @@ LogicalResult ConvertLSTMCellSimpleToFusedLSTM::RewriteFunc() { // Create the fused LSTM op. 
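// Aside: a sketch of the dynamically-shaped result types built in the
// lstm_utils.cc hunks above, where kDynamic (kTFDynamicSize on the TF side)
// marks a dimension unknown until runtime. Assuming an MLIR environment:
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

static mlir::RankedTensorType BatchOfOneDynamicWidth(mlir::MLIRContext* ctx) {
  // tensor<1x?xf32>: leading dim fixed at 1, trailing dim dynamic.
  return mlir::RankedTensorType::get({1, mlir::ShapedType::kDynamic},
                                     mlir::Float32Type::get(ctx));
}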
SmallVector output_shape = {1, n_output_}; auto result_type = mlir::RankedTensorType::get( - output_shape, input_.getType().cast().getElementType()); + output_shape, + mlir::cast(input_.getType()).getElementType()); lstm_ = builder_.create( fused_func_op_.getLoc(), result_type, input_, input2input_, input2forget_, input2cell_, input2output_, rec2input_, rec2forget_, rec2cell_, @@ -436,7 +439,7 @@ LogicalResult ConvertLSTMCellSimpleToFusedLSTM::RewriteFunc() { SmallVector func_output_shape = {1, tensorflow::kTFDynamicSize}; auto func_result_type = tensorflow::GetTypeFromTFTensorShape( func_output_shape, - input_.getType().cast().getElementType()); + mlir::cast(input_.getType()).getElementType()); auto tensor_cast = builder_.create( fused_func_op_.getLoc(), func_result_type, lstm_.getResult()); @@ -491,7 +494,7 @@ LogicalResult ConvertLSTMCellSimpleToFusedLSTM::Initialize() { bias_ = fused_func_op_.getArgument(2); weight_ = fused_func_op_.getArgument(1); - weight_type_ = weight_.getType().cast(); + weight_type_ = mlir::cast(weight_.getType()); if (weight_type_.getRank() != 2) { return fused_func_op_.emitError() << "The weight tensor was not of rank 2"; @@ -505,7 +508,7 @@ LogicalResult ConvertLSTMCellSimpleToFusedLSTM::Initialize() { n_cell_ = weight_type_.getDimSize(1) / num_gates_; projection_ = fused_func_op_.getArgument(3); - projection_type_ = projection_.getType().cast(); + projection_type_ = mlir::cast(projection_.getType()); if (projection_type_.getRank() != 2) { n_output_ = n_cell_; } else { @@ -532,7 +535,8 @@ LogicalResult ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM::Initialize() { } layer_norm_scale_ = fused_func_op_.getArgument(4); - layer_norm_scale_type_ = layer_norm_scale_.getType().cast(); + layer_norm_scale_type_ = + mlir::cast(layer_norm_scale_.getType()); if (layer_norm_scale_type_.getRank() != 1) { return fused_func_op_.emitError() << "The layer_norm_scale tensor was not of rank 1"; @@ -607,7 +611,7 @@ TF::ReshapeOp CreateFlattenOP(const Value& input, Location loc, LogicalResult CreateEqualSizeSplitVOp(Value input, int axis, int splits, Location loc, OpBuilder* builder, Operation** result) { - auto input_type = input.getType().cast(); + auto input_type = mlir::cast(input.getType()); SmallVector output_shape; int size_of_splits; if (input_type.getRank() < axis || axis < 0) return failure(); @@ -666,7 +670,7 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, if (time_major_attr == nullptr) return failure(); bool time_majored = time_major_attr.getValue(); - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type) { func_op.emitError() << "Input type is not a ranked tensor type"; return failure(); @@ -692,7 +696,7 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, // Setup correct weights. 
RankedTensorType weight_type = - weight_kernel.getType().cast(); + mlir::cast(weight_kernel.getType()); if (weight_type.getRank() != 2) return func_op.emitError() << "The weight should be rank of 2"; @@ -700,7 +704,7 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, Transpose2D(builder, weight_kernel, weight_type, func_op.getLoc()); RankedTensorType recurrent_kernel_type = - recurrent_kernel.getType().cast(); + mlir::cast(recurrent_kernel.getType()); const int64_t n_output = recurrent_kernel_type.getDimSize(0); Value transpose_recurrent_kernel = Transpose2D( @@ -726,28 +730,28 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, // IndyLSTMs are a LSTM variant with diagonal recurrent weight // matrices. For optimization purposes these are provided as vectors. Value recurrent_to_input_weights = - indy ? CreateFlattenOP(recurrent_weights_array->getResult(0), - func_op.getLoc(), builder) - .getResult() - .cast() + indy ? mlir::cast( + CreateFlattenOP(recurrent_weights_array->getResult(0), + func_op.getLoc(), builder) + .getResult()) : recurrent_weights_array->getResult(0); Value recurrent_to_forget_weights = - indy ? CreateFlattenOP(recurrent_weights_array->getResult(1), - func_op.getLoc(), builder) - .getResult() - .cast() + indy ? mlir::cast( + CreateFlattenOP(recurrent_weights_array->getResult(1), + func_op.getLoc(), builder) + .getResult()) : recurrent_weights_array->getResult(1); Value recurrent_to_cell_weights = - indy ? CreateFlattenOP(recurrent_weights_array->getResult(2), - func_op.getLoc(), builder) - .getResult() - .cast() + indy ? mlir::cast( + CreateFlattenOP(recurrent_weights_array->getResult(2), + func_op.getLoc(), builder) + .getResult()) : recurrent_weights_array->getResult(2); Value recurrent_to_output_weights = - indy ? CreateFlattenOP(recurrent_weights_array->getResult(3), - func_op.getLoc(), builder) - .getResult() - .cast() + indy ? mlir::cast( + CreateFlattenOP(recurrent_weights_array->getResult(3), + func_op.getLoc(), builder) + .getResult()) : recurrent_weights_array->getResult(3); // Splits the bias into 4: @@ -765,7 +769,7 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, } auto result_type = mlir::RankedTensorType::get( output_shape, - final_inputs.getType().cast().getElementType()); + mlir::cast(final_inputs.getType()).getElementType()); Value none = CreateNoneValue(builder, func_op.getLoc()); auto lstm = builder->create( @@ -866,7 +870,8 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op, // All the rest: states, device. 
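// Aside: a sketch of why the IndyLSTM recurrent weights above can be carried
// as flattened vectors: with a diagonal recurrent matrix, the recurrent
// matmul collapses to an elementwise product (editor's illustration):
#include <cstddef>

static void DiagonalRecurrentStep(const float* w_diag, const float* h_prev,
                                  float* out, size_t n) {
  // matmul(diag(w), h) == w * h elementwise.
  for (size_t i = 0; i < n; ++i) out[i] = w_diag[i] * h_prev[i];
}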
for (int i = 2; i < 5; ++i) { - auto result_type = func_op.getResultTypes()[i].dyn_cast(); + auto result_type = + mlir::dyn_cast(func_op.getResultTypes()[i]); outputs.push_back(CreatTfF32ConstOp(builder, result_type.getShape(), 0.0f, func_op.getLoc())); output_types.push_back(result_type); diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc index 342bbb5c7fe382..7fe7ae8404137c 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -134,22 +134,18 @@ TEST_F(LstmUtilsTest, ConvertLSTMCellSimple) { auto transpose_op = fused_lstm_func_.getBody().front().begin(); transpose_op++; - EXPECT_EQ( - transpose_op->getOperand(0).getType().cast().getDimSize( - 0), - 3); - EXPECT_EQ( - transpose_op->getOperand(0).getType().cast().getDimSize( - 1), - 12); - EXPECT_EQ( - transpose_op->getResult(0).getType().cast().getDimSize( - 0), - 12); - EXPECT_EQ( - transpose_op->getResult(0).getType().cast().getDimSize( - 1), - 3); + EXPECT_EQ(mlir::cast(transpose_op->getOperand(0).getType()) + .getDimSize(0), + 3); + EXPECT_EQ(mlir::cast(transpose_op->getOperand(0).getType()) + .getDimSize(1), + 12); + EXPECT_EQ(mlir::cast(transpose_op->getResult(0).getType()) + .getDimSize(0), + 12); + EXPECT_EQ(mlir::cast(transpose_op->getResult(0).getType()) + .getDimSize(1), + 3); auto it = fused_lstm_func_.getBody().back().rbegin(); EXPECT_EQ(it->getName().getStringRef(), @@ -161,33 +157,31 @@ TEST_F(LstmUtilsTest, ConvertLSTMCellSimple) { EXPECT_EQ(it->getNumOperands(), 24); EXPECT_EQ(it->getNumResults(), 1); // cifg = false, so input2input is not None. - EXPECT_FALSE(it->getOperand(1).getType().isa()); + EXPECT_FALSE(mlir::isa(it->getOperand(1).getType())); // input layer norm is None - EXPECT_TRUE(it->getOperand(20).getType().isa()); + EXPECT_TRUE(mlir::isa(it->getOperand(20).getType())); // proj_bias is F32 - EXPECT_TRUE(it->getOperand(17) - .getType() - .cast() + EXPECT_TRUE(mlir::cast(it->getOperand(17).getType()) .getElementType() .isF32()); // output gate bias is 0 since it is out of bounds of the bias tensor, so // we set its value as a const tensor of specified size and value 0. - EXPECT_TRUE(mlir::cast( - it->getOpOperand(15).get().getDefiningOp()) - .getValue() - .cast() - .getValues()[0] - .getValue() - .isExactlyValue(0.0f)); + EXPECT_TRUE( + mlir::cast(mlir::cast( + it->getOpOperand(15).get().getDefiningOp()) + .getValue()) + .getValues()[0] + .getValue() + .isExactlyValue(0.0f)); EXPECT_EQ(fused_lstm_func_.getFunctionType().getNumResults(), 1); auto output_types = fused_lstm_func_.getFunctionType().getResults(); SmallVector output_shape{1, mlir::ShapedType::kDynamic}; - EXPECT_EQ(output_types[0].cast().getShape().size(), + EXPECT_EQ(mlir::cast(output_types[0]).getShape().size(), output_shape.size()); for (int i = 0; i < output_shape.size(); i++) { - EXPECT_EQ(output_types[0].cast().getDimSize(i), + EXPECT_EQ(mlir::cast(output_types[0]).getDimSize(i), output_shape[i]); } } @@ -215,7 +209,7 @@ TEST_F(LstmUtilsTest, ConvertLSTMCellSimpleToFusedLSTMCoupleInputForget) { EXPECT_EQ(it->getNumOperands(), 24); EXPECT_EQ(it->getNumResults(), 1); // cifg = true, so input2input is None. 
- EXPECT_TRUE(it->getOperand(1).getType().isa()); + EXPECT_TRUE(mlir::isa(it->getOperand(1).getType())); } TEST_F(LstmUtilsTest, ConvertLayerNormLSTMCellSimpleToFusedLSTM) { @@ -242,23 +236,25 @@ TEST_F(LstmUtilsTest, ConvertLayerNormLSTMCellSimpleToFusedLSTM) { EXPECT_EQ(it->getNumOperands(), 24); EXPECT_EQ(it->getNumResults(), 1); // cifg = false, so input2input is not None. - EXPECT_FALSE(it->getOperand(1).getType().isa()); + EXPECT_FALSE(mlir::isa(it->getOperand(1).getType())); // input layer norm - EXPECT_FALSE(it->getOperand(20).getType().isa()); + EXPECT_FALSE(mlir::isa(it->getOperand(20).getType())); + EXPECT_EQ(mlir::cast(it->getOperand(20).getType()) + .getShape() + .size(), + 1); EXPECT_EQ( - it->getOperand(20).getType().cast().getShape().size(), - 1); - EXPECT_EQ(it->getOperand(20).getType().cast().getDimSize(0), - 3); + mlir::cast(it->getOperand(20).getType()).getDimSize(0), + 3); EXPECT_EQ(fused_ln_lstm_func_.getFunctionType().getNumResults(), 1); auto output_types = fused_ln_lstm_func_.getFunctionType().getResults(); SmallVector output_shape{1, mlir::ShapedType::kDynamic}; - EXPECT_EQ(output_types[0].cast().getShape().size(), + EXPECT_EQ(mlir::cast(output_types[0]).getShape().size(), output_shape.size()); for (int i = 0; i < output_shape.size(); i++) { - EXPECT_EQ(output_types[0].cast().getDimSize(i), + EXPECT_EQ(mlir::cast(output_types[0]).getDimSize(i), output_shape[i]); } } diff --git a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc index 5633068509faf4..cab3df456c0e00 100644 --- a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" namespace mlir { @@ -74,7 +75,7 @@ LogicalResult ConvertNMSPaddedFunc::VerifySignature() { // The TFLite fused op does not support batching yet. // TODO(b/158709815): Add support for batches with padded NMS. 
auto boxes_type = - func_.getFunctionType().getInput(0).dyn_cast(); + mlir::dyn_cast(func_.getFunctionType().getInput(0)); if (boxes_type == nullptr || !boxes_type.hasRank() || boxes_type.getRank() != 2) { return func_.emitWarning() << "TFLite does not support batched input for " @@ -121,7 +122,7 @@ LogicalResult ConvertSSDPostProcessFunc::CreateNMSCustomOptions( failed(AddFloatAttr(func, attrs, "w_scale", &fbb))) return failure(); auto use_regular_nms = - attrs.get("use_regular_nms").dyn_cast_or_null(); + mlir::dyn_cast_or_null(attrs.get("use_regular_nms")); if (!use_regular_nms) { return func.emitError() << "use_regular_nms attribute is not set or not a bool"; @@ -137,7 +138,7 @@ LogicalResult ConvertSSDPostProcessFunc::CreateNMSCustomOptions( LogicalResult ConvertSSDPostProcessFunc::AddIntAttr( func::FuncOp func, DictionaryAttr attrs, const std::string& attribute, flexbuffers::Builder* builder) { - auto int_attr = attrs.get(attribute).dyn_cast_or_null(); + auto int_attr = mlir::dyn_cast_or_null(attrs.get(attribute)); if (!int_attr) { return func.emitError() << attribute.c_str() << " attribute is not set or not an integer"; @@ -149,7 +150,7 @@ LogicalResult ConvertSSDPostProcessFunc::AddIntAttr( LogicalResult ConvertSSDPostProcessFunc::AddFloatAttr( func::FuncOp func, DictionaryAttr attrs, const std::string& attribute, flexbuffers::Builder* builder) { - auto float_attr = attrs.get(attribute).dyn_cast_or_null(); + auto float_attr = mlir::dyn_cast_or_null(attrs.get(attribute)); if (!float_attr) { return func.emitError() << attribute.c_str() << " attribute is not set or not a float"; @@ -160,7 +161,7 @@ LogicalResult ConvertSSDPostProcessFunc::AddFloatAttr( LogicalResult ConvertSSDPostProcessFunc::HasIntAttr( func::FuncOp func, DictionaryAttr attrs, const std::string& attribute) { - auto int_attr = attrs.get(attribute).dyn_cast_or_null(); + auto int_attr = mlir::dyn_cast_or_null(attrs.get(attribute)); if (!int_attr) { return func.emitWarning() << attribute.c_str() << " attribute is not set or not an integer"; @@ -170,7 +171,7 @@ LogicalResult ConvertSSDPostProcessFunc::HasIntAttr( LogicalResult ConvertSSDPostProcessFunc::HasFloatAttr( func::FuncOp func, DictionaryAttr attrs, const std::string& attribute) { - auto float_attr = attrs.get(attribute).dyn_cast_or_null(); + auto float_attr = mlir::dyn_cast_or_null(attrs.get(attribute)); if (!float_attr) { return func.emitWarning() << attribute.c_str() << " attribute is not set or not a float"; diff --git a/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc b/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc index c7944b67406907..f6595331c02415 100644 --- a/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc @@ -21,6 +21,7 @@ limitations under the License. 
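// Aside: a sketch of how the Add*Attr helpers above serialize attributes:
// TFLite custom ops carry their options as a FlexBuffer map. Keys and values
// here are illustrative, assuming the flexbuffers API from flatbuffers:
#include <cstdint>
#include <vector>
#include "flatbuffers/flexbuffers.h"

static std::vector<uint8_t> BuildCustomOptions() {
  flexbuffers::Builder fbb;
  const size_t map_start = fbb.StartMap();
  fbb.Int("max_detections", 10);
  fbb.Float("nms_iou_threshold", 0.5f);
  fbb.Bool("use_regular_nms", false);
  fbb.EndMap(map_start);
  fbb.Finish();  // required before reading the buffer back
  return fbb.GetBuffer();
}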
#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -45,14 +46,15 @@ inline LogicalResult HasIntegerArrayWithSize(func::FuncOp* func, const DictionaryAttr& attrs, const std::string& attr_name, int N) { - ArrayAttr array_attr = attrs.get(attr_name).dyn_cast_or_null(); + ArrayAttr array_attr = + mlir::dyn_cast_or_null(attrs.get(attr_name)); if (array_attr == nullptr || array_attr.size() != N) { return func->emitWarning() << "'" << attr_name << "' attribute for " << kMaxUnpooling << " must be set and has size of " << N; } for (Attribute integer_attr : array_attr.getValue()) { - IntegerAttr value = integer_attr.dyn_cast(); + IntegerAttr value = mlir::dyn_cast(integer_attr); if (!value) { return func->emitWarning() << "'" << attr_name << "' attribute for " << kMaxUnpooling @@ -66,7 +68,8 @@ inline LogicalResult GetIntegerArraySafe( func::FuncOp* func, const DictionaryAttr& attrs, const std::string& attr_name, llvm::SmallVectorImpl* results, int N) { - ArrayAttr array_attr = attrs.get(attr_name).dyn_cast_or_null(); + ArrayAttr array_attr = + mlir::dyn_cast_or_null(attrs.get(attr_name)); if (array_attr == nullptr || array_attr.size() != N) { return func->emitError() << "'" << attr_name << "' attribute for " << kMaxUnpooling @@ -75,7 +78,7 @@ inline LogicalResult GetIntegerArraySafe( results->reserve(N); for (Attribute integer_attr : array_attr.getValue()) { - IntegerAttr value = integer_attr.dyn_cast(); + IntegerAttr value = mlir::dyn_cast(integer_attr); if (!value) { return func->emitError() << "'" << attr_name << "' attribute for " << kMaxUnpooling @@ -132,13 +135,12 @@ LogicalResult ConvertMaxUnpoolingFunc::VerifySignature() { } // Retrieves padding. - auto padding = attrs.get("padding").dyn_cast_or_null(); + auto padding = mlir::dyn_cast_or_null(attrs.get("padding")); if (!padding) { return func_.emitWarning() << "'padding' attribute for " << kMaxUnpooling << " is not set or not a string"; } - if (!padding.getValue().equals("VALID") && - !padding.getValue().equals("SAME")) { + if (padding.getValue() != "VALID" && padding.getValue() != "SAME") { return func_.emitWarning() << "Padding for " << kMaxUnpooling << " must be 'SAME' or 'VALID'"; } @@ -166,14 +168,14 @@ LogicalResult ConvertMaxUnpoolingFunc::CreateCustomOptions( pool_params.stride_width = strides[1]; // Retrieves padding. - auto padding = attrs.get("padding").dyn_cast_or_null(); + auto padding = mlir::dyn_cast_or_null(attrs.get("padding")); if (!padding) { return func_.emitError() << "'padding' attribute for " << kMaxUnpooling << " is not set or not a string"; } - if (padding.getValue().equals("VALID")) { + if (padding.getValue() == "VALID") { pool_params.padding = kTfLitePaddingValid; - } else if (padding.getValue().equals("SAME")) { + } else if (padding.getValue() == "SAME") { pool_params.padding = kTfLitePaddingSame; } else { return func_.emitError() @@ -224,22 +226,22 @@ LogicalResult ConvertDenseImageWarpFunc::VerifySignature() { } // Check types and shapes. 
- auto image_type = - func_.getFunctionType().getInput(0).dyn_cast_or_null(); + auto image_type = mlir::dyn_cast_or_null( + func_.getFunctionType().getInput(0)); if (!image_type || !image_type.getElementType().isF32() || image_type.getRank() != 4) { return func_.emitWarning() << "Image should be a 4D float tensor"; } - auto flow_type = - func_.getFunctionType().getInput(1).dyn_cast_or_null(); + auto flow_type = mlir::dyn_cast_or_null( + func_.getFunctionType().getInput(1)); if (!flow_type || !flow_type.getElementType().isF32() || flow_type.getRank() != 4) { return func_.emitWarning() << "Flow should be a 4D float tensor"; } - auto output_type = - func_.getFunctionType().getResult(0).dyn_cast_or_null(); + auto output_type = mlir::dyn_cast_or_null( + func_.getFunctionType().getResult(0)); if (!output_type || !output_type.getElementType().isF32() || output_type.getRank() != 4) { return func_.emitWarning() << "Output should be a 4D float tensor"; diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc index 7ce9c56086e691..5e9bcc16d27537 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -62,11 +62,13 @@ inline ConstBytesAttr CustomOption(OpBuilder* builder, } inline TensorType GetInputType(func::FuncOp func, int idx) { - return func.getFunctionType().getInput(idx).dyn_cast_or_null(); + return mlir::dyn_cast_or_null( + func.getFunctionType().getInput(idx)); } inline TensorType GetResultType(func::FuncOp func, int idx) { - return func.getFunctionType().getResult(idx).dyn_cast_or_null(); + return mlir::dyn_cast_or_null( + func.getFunctionType().getResult(idx)); } inline bool RankEquals(const TensorType& type, int rank) { @@ -89,7 +91,7 @@ LogicalResult VerifyWhitespaceTokenizer(func::FuncOp func) { // * 2nd output is the inner offset; // * 3rd output is the outer offset. 
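// Aside: a sketch of the ragged-tensor convention behind the tokenizer
// signature above: a flat values tensor plus row_splits, where row i spans
// values[row_splits[i] .. row_splits[i+1]) (editor's illustration):
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

static std::vector<std::vector<std::string>> DecodeRagged(
    const std::vector<std::string>& values,
    const std::vector<int64_t>& row_splits) {
  std::vector<std::vector<std::string>> rows;
  for (size_t i = 0; i + 1 < row_splits.size(); ++i)
    rows.emplace_back(values.begin() + row_splits[i],
                      values.begin() + row_splits[i + 1]);
  return rows;
}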
auto input_type = GetInputType(func, 0); - if (!input_type || !input_type.getElementType().isa() || + if (!input_type || !mlir::isa(input_type.getElementType()) || !input_type.hasRank()) { return func.emitError() << "Input should be a string tensor"; } @@ -107,7 +109,7 @@ LogicalResult VerifyWhitespaceTokenizer(func::FuncOp func) { auto value_type = GetResultType(func, 0); if (!RankEquals(value_type, 1) || - !value_type.getElementType().isa()) { + !mlir::isa(value_type.getElementType())) { return func.emitError() << "1st output should be string tensor"; } if (func.getNumResults() > 1) { @@ -157,12 +159,14 @@ LogicalResult VerifyNgrams(func::FuncOp func) { int row_splits = func.getFunctionType().getInputs().size() - kRowSplits; if (row_splits == 0) { auto input_values = GetInputType(func, kValues); - if (!input_values || !input_values.getElementType().isa()) { + if (!input_values || + !mlir::isa(input_values.getElementType())) { return func.emitError() << "Input " << kValues << " should be a string tensor"; } auto output_values = GetResultType(func, kValues); - if (!output_values || !output_values.getElementType().isa()) { + if (!output_values || + !mlir::isa(output_values.getElementType())) { return func.emitError() << "Output " << kValues << " should be a string tensor"; } @@ -175,13 +179,13 @@ LogicalResult VerifyNgrams(func::FuncOp func) { } else { auto input_values = GetInputType(func, kValues); if (!RankEquals(input_values, 1) || - !input_values.getElementType().isa()) { + !mlir::isa(input_values.getElementType())) { return func.emitError() << "Input " << kValues << " should be a 1D string tensor"; } auto output_values = GetResultType(func, kValues); if (!RankEquals(output_values, 1) || - !output_values.getElementType().isa()) { + !mlir::isa(output_values.getElementType())) { return func.emitError() << "Output " << kValues << " should be a 1D string tensor"; } @@ -211,14 +215,14 @@ LogicalResult CreateNgramsCustomOption(func::FuncOp func, DictionaryAttr attrs, flexbuffers::Builder fbb; size_t start_map = fbb.StartMap(); - auto width = attrs.get("width").dyn_cast_or_null(); + auto width = mlir::dyn_cast_or_null(attrs.get("width")); if (!width) { return func.emitError() << "'width' attribute is not set or not an integer"; } fbb.Int("width", width.getInt()); auto string_separator = - attrs.get("string_separator").dyn_cast_or_null(); + mlir::dyn_cast_or_null(attrs.get("string_separator")); if (!string_separator) { return func.emitError() << "'string_separator' attribute is not set or not a string"; @@ -229,14 +233,14 @@ LogicalResult CreateNgramsCustomOption(func::FuncOp func, DictionaryAttr attrs, string_separator.getValue().size()); fbb.String("string_separator", string_separator_str); - auto axis = attrs.get("axis").dyn_cast_or_null(); + auto axis = mlir::dyn_cast_or_null(attrs.get("axis")); if (!axis) { return func.emitError() << "'axis' attribute is not set or not an integer"; } fbb.Int("axis", axis.getInt()); auto reduction_type = - attrs.get("reduction_type").dyn_cast_or_null(); + mlir::dyn_cast_or_null(attrs.get("reduction_type")); if (!reduction_type) { return func.emitError() << "'reduction_type' attribute is not set or not a string"; @@ -277,23 +281,23 @@ LogicalResult VerifySgnnProjection(func::FuncOp func, FuncAttr attr) { return func.emitError() << "Mismatched number of inputs and outputs."; } auto values_type = GetInputType(func, 0); - if (!values_type || !values_type.getElementType().isa()) { + if (!values_type || !mlir::isa(values_type.getElementType())) { return 
func.emitError() << "First input should be a string tensor"; } auto row_splits_type = GetInputType(func, 1); if (!row_splits_type || - !row_splits_type.getElementType().isa()) { + !mlir::isa(row_splits_type.getElementType())) { return func.emitError() << "Second input should be an integer tensor"; } auto hash_seed = - attr.getAttrs().get("hash_seed").dyn_cast_or_null(); + mlir::dyn_cast_or_null(attr.getAttrs().get("hash_seed")); if (!hash_seed) { return func.emitError() << "'hash_seed' attribute is not set or not an array"; } auto output_type = GetResultType(func, 0); - if (!output_type || !output_type.getElementType().isa() || + if (!output_type || !mlir::isa(output_type.getElementType()) || !RankEquals(output_type, 2)) { return func.emitError() << "Output should be a 2D float tensor."; } @@ -302,7 +306,8 @@ LogicalResult VerifySgnnProjection(func::FuncOp func, FuncAttr attr) { << "Output 2nd dimension should be the num of hash seeds."; } - auto buckets = attr.getAttrs().get("buckets").dyn_cast_or_null(); + auto buckets = + mlir::dyn_cast_or_null(attr.getAttrs().get("buckets")); if (!buckets) { return func.emitError() << "'buckets' attribute is not set or not int"; } @@ -316,15 +321,16 @@ LogicalResult CreateSgnnProjectionCustomOption( flexbuffers::Builder fbb; size_t start_map = fbb.StartMap(); - auto hash_seed = attrs.get("hash_seed").dyn_cast_or_null(); + auto hash_seed = mlir::dyn_cast_or_null(attrs.get("hash_seed")); auto vector_start = fbb.StartVector("hash_seed"); for (int i = 0; i < hash_seed.size(); i++) { fbb.Add(static_cast( - (hash_seed.getValue().data() + i)->dyn_cast().getInt())); + mlir::dyn_cast(*(hash_seed.getValue().data() + i)) + .getInt())); } fbb.EndVector(vector_start, /*typed=*/true, /*fixed=*/false); - auto buckets = attrs.get("buckets").dyn_cast_or_null(); + auto buckets = mlir::dyn_cast_or_null(attrs.get("buckets")); fbb.Int("buckets", buckets.getInt()); fbb.EndMap(start_map); diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc index 4ddb6b1c4411be..5138d7475452cd 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils_test.cc @@ -33,7 +33,7 @@ namespace { void Register(const std::string& op_name, OpRegistry* registry) { registry->Register([op_name](OpRegistrationData* op_reg_data) -> Status { op_reg_data->op_def.set_name(op_name); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); }); } diff --git a/tensorflow/compiler/mlir/lite/utils/utils.h b/tensorflow/compiler/mlir/lite/utils/utils.h index 9fce1bc44387c3..d73bf37ebd748a 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.h +++ b/tensorflow/compiler/mlir/lite/utils/utils.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include #include +#include #include "llvm/ADT/ArrayRef.h" #include "mlir/IR/Attributes.h" // from @llvm-project @@ -62,6 +63,139 @@ inline bool OpHasSameStaticShapes(Operation* op) { return true; } +// Utility function to map final permutation to initial permutation +// initial -> permutation1 -> permutation2 -> final +inline DenseElementsAttr RemapPermutation(Value permutation1, + DenseElementsAttr perm2_const) { + SmallVector initial_permutation; + DenseElementsAttr perm1_const; + + SmallVector new_permutation; + if (matchPattern(permutation1, m_Constant(&perm1_const))) { + for (int32_t idx = 0; idx < perm1_const.getNumElements(); ++idx) { + initial_permutation.push_back(idx); + } + for (auto perm : perm2_const.getValues()) { + new_permutation.push_back( + initial_permutation[perm1_const + .getValues()[perm.getSExtValue()] + .getSExtValue()]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(new_permutation.size())}, + mlir::IntegerType::get(permutation1.getContext(), 32)), + llvm::ArrayRef(new_permutation)); +} + +// Utility function to map final permutation to initial permutation +// initial -> permutation1 -> permutation2 -> final +inline DenseElementsAttr RemapPermutation(Value permutation1, + Value permutation2) { + DenseElementsAttr perm2_const; + (void)matchPattern(permutation2, m_Constant(&perm2_const)); + + return RemapPermutation(permutation1, perm2_const); +} + +// Returns true if the transpose op is trivial. Trivial means that +// the permutation is a cyclic permutation of the original shape with only the +// identity dimensions permuted. +inline bool IsTransposeTrivial(llvm::ArrayRef input_shape, + Value perm) { + DenseElementsAttr perm_values_attr; + if (!matchPattern(perm, m_Constant(&perm_values_attr))) return false; + + SmallVector perm_values; + for (const auto& dim : perm_values_attr.getValues()) + perm_values.push_back(dim.getSExtValue()); + + // This should never happen unless the input graph is malformed. + if (input_shape.size() != perm_values.size()) { + return false; + } + + SmallVector old_major_index_ordering; + SmallVector new_major_index_ordering; + for (int i = 0, end = input_shape.size(); i < end; i++) { + if (input_shape[i] != 1) { + old_major_index_ordering.push_back(i); + } + + if (input_shape[perm_values[i]] != 1) { + new_major_index_ordering.push_back(perm_values[i]); + } + } + return (old_major_index_ordering == new_major_index_ordering); +} + +// Returns the permutation that maps the input shape to the output shape. +// This is only valid for trivial reshape ops. +inline DenseElementsAttr GetPermutationFromTrivialReshape( + ShapedType input_type, ShapedType output_type) { + ArrayRef in_shape = input_type.getShape(); + ArrayRef out_shape = output_type.getShape(); + + // Get the indexes of the non-identity dimensions and the identity dimensions + // in the input shape. + SmallVector input_nonidentity_dims_index_array; + SmallVector input_identity_dims_index_array; + + // Since the reshape is trivial, the input and output shapes should have the + // same number of dimensions. And the non-identity dimensions must be in the + // same cyclic order. + for (size_t idx = 0; idx < in_shape.size(); ++idx) { + if (in_shape[idx] != 1) { + input_nonidentity_dims_index_array.push_back(idx); + } else { + input_identity_dims_index_array.push_back(idx); + } + } + + // Get the permutation that maps the input shape to the output shape. 
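// Aside: a sketch of what the RemapPermutation helper above computes:
// applying perm1 and then perm2 to a tensor equals applying the single
// composed permutation composed[i] = perm1[perm2[i]] (editor's illustration):
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<int32_t> ComposePermutations(
    const std::vector<int32_t>& perm1, const std::vector<int32_t>& perm2) {
  std::vector<int32_t> composed(perm2.size());
  for (size_t i = 0; i < perm2.size(); ++i) composed[i] = perm1[perm2[i]];
  return composed;
}
// e.g. perm1 = {1, 0, 2} then perm2 = {2, 1, 0} composes to {2, 0, 1}.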
+ SmallVector permutation; + size_t nonidentity_dims_index_pointer = 0; + size_t identity_dims_index_pointer = 0; + for (auto out_dim : out_shape) { + if (out_dim != 1) { + permutation.push_back( + input_nonidentity_dims_index_array[nonidentity_dims_index_pointer++]); + } else { + permutation.push_back( + input_identity_dims_index_array[identity_dims_index_pointer++]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(permutation.size())}, + mlir::IntegerType::get(input_type.getContext(), 32)), + llvm::ArrayRef(permutation)); +} + +// Returns true if the reshape op is equivalent to a transpose op. +// This is true if the reshape op is a trivial reshape op, meaning no change in +// the order of non-identity dimensions. +inline bool IsReshapeEquivalentToTranspose(ShapedType input_type, + ShapedType output_type) { + std::vector in_shape{input_type.getShape().vec()}; + std::vector out_shape{output_type.getShape().vec()}; + + // If the reshape changes the number of dimensions, it cannot be interpreted + // as a transpose. + if (in_shape.size() != out_shape.size()) { + return false; + } + + in_shape.erase(std::remove(in_shape.begin(), in_shape.end(), 1), + in_shape.end()); + out_shape.erase(std::remove(out_shape.begin(), out_shape.end(), 1), + out_shape.end()); + return in_shape == out_shape; +} + // Checks if all elements in the constant attribute value are 1. inline bool IsAllOnesConstant(Attribute value) { auto values = value.cast().getValues(); diff --git a/tensorflow/compiler/mlir/lite/utils/utils.td b/tensorflow/compiler/mlir/lite/utils/utils.td index 42af8c67b2a7ce..067c95f1ce4c15 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.td +++ b/tensorflow/compiler/mlir/lite/utils/utils.td @@ -19,6 +19,9 @@ include "mlir/IR/OpBase.td" include "mlir/Dialect/Func/IR/FuncOps.td" include "mlir/IR/PatternBase.td" +def CreateNoneValue : NativeCodeCall< "$_builder.create($0.getLoc(), $_builder.getUnitAttr())">; + // Returns shape of a ranked tensor. // if called without a ranked tensor it will fail. def GetShape: NativeCodeCall<"GetShape($0)">; @@ -26,6 +29,27 @@ def GetShape: NativeCodeCall<"GetShape($0)">; // Constraint that values in list attribute are all ones. def IsAllOnesConstant : Constraint>; +// Constraint that checks if the transpose op is trivial. Trivial means that +// the permutation is a cyclic permutation of the original shape with only the +// identity dimensions permuted. +def IsTransposeTrivial : Constraint().getShape(), $1)">>; + +// Constraint that checks if the reshape op is equivalent to a transpose op. +// This is true if the reshape op is a trivial reshape op, meaning no change in +// the order of non-identity dimensions. +def IsReshapeEquivalentToTranspose : Constraint()," + "$1.getType().cast())">>; + +// Returns the permutation of the trivial reshape op, this will be used to +// construct the transpose op. +def GetPermutationFromTrivialReshape : NativeCodeCall< + "TFL::GetPermutationFromTrivialReshape(" + "$0.getType().cast()," + "$1.getType().cast())">; + // Constraint that checks if all values in offset between two // attributes are non-negative.
def HasNonNegativeOffset : Constraint>; @@ -59,6 +83,10 @@ def SameElementType : Constraint< class GetTransposedType : NativeCodeCall< "GetTransposedType($0, " # perm # ")">; +// Function to map final permutation to initial permutation +// initial -> permutation1 -> permutation2 -> final +def RemapPermutation: NativeCodeCall<"RemapPermutation($0, $1)">; + // Checks if all of an ops inputs are the same static shape. // BUILD NOTE: "OpHasSameStaticShapes" here refers to the C++ function defined // in `utils/utils.h`. The `utils.h` header is included in `tfl_ops.h` so all diff --git a/tensorflow/compiler/mlir/lite/utils/validators.cc b/tensorflow/compiler/mlir/lite/utils/validators.cc index f4714e00e5f2a4..902d7b144ba69d 100644 --- a/tensorflow/compiler/mlir/lite/utils/validators.cc +++ b/tensorflow/compiler/mlir/lite/utils/validators.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { namespace TFL { @@ -36,45 +37,45 @@ bool TFIntListIs1XY1(Operation *op, StringRef name, IntegerAttr *x, auto elements = attr.getValue(); if (elements.size() != 4 || std::any_of(elements.begin(), elements.end(), - [](Attribute e) { return !e.isa(); })) + [](Attribute e) { return !mlir::isa(e); })) return false; - if (elements.front().cast().getInt() != 1 || - elements.back().cast().getInt() != 1) + if (mlir::cast(elements.front()).getInt() != 1 || + mlir::cast(elements.back()).getInt() != 1) return false; Builder b(op->getContext()); - *x = b.getI32IntegerAttr(elements[1].cast().getInt()); - *y = b.getI32IntegerAttr(elements[2].cast().getInt()); + *x = b.getI32IntegerAttr(mlir::cast(elements[1]).getInt()); + *y = b.getI32IntegerAttr(mlir::cast(elements[2]).getInt()); return true; } // Returns true if the attribute is an integer list of the form [1, X, Y, 1]. bool TFIntListIs1XY1(const Attribute attr) { - const auto &elements = attr.cast().getValue(); + const auto &elements = mlir::cast(attr).getValue(); if (elements.size() != 4 || std::any_of(elements.begin(), elements.end(), - [](Attribute e) { return !e.isa(); })) + [](Attribute e) { return !mlir::isa(e); })) return false; - if (elements.front().cast().getValue() != 1 || - elements.back().cast().getValue() != 1) + if (mlir::cast(elements.front()).getValue() != 1 || + mlir::cast(elements.back()).getValue() != 1) return false; return true; } // Returns true if the attribute is an integer list of the form [1, 1, X, Y]. 
bool TFIntListIs11XY(const Attribute attr) { - const auto &elements = attr.cast().getValue(); + const auto &elements = mlir::cast(attr).getValue(); if (elements.size() != 4 || std::any_of(elements.begin(), elements.end(), - [](Attribute e) { return !e.isa(); })) + [](Attribute e) { return !mlir::isa(e); })) return false; const Attribute *data = elements.data(); - if (data[0].cast().getValue() != 1 || - data[1].cast().getValue() != 1) + if (mlir::cast(data[0]).getValue() != 1 || + mlir::cast(data[1]).getValue() != 1) return false; return true; } @@ -91,17 +92,17 @@ bool TFIntListIs1XYZ1(Operation *op, StringRef name, IntegerAttr *x, auto elements = attr.getValue(); if (elements.size() != 5 || std::any_of(elements.begin(), elements.end(), - [](Attribute e) { return !e.isa(); })) + [](Attribute e) { return !mlir::isa(e); })) return false; - if (elements.front().cast().getInt() != 1 || - elements.back().cast().getInt() != 1) + if (mlir::cast(elements.front()).getInt() != 1 || + mlir::cast(elements.back()).getInt() != 1) return false; Builder b(op->getContext()); - *x = b.getI32IntegerAttr(elements[1].cast().getInt()); - *y = b.getI32IntegerAttr(elements[2].cast().getInt()); - *z = b.getI32IntegerAttr(elements[3].cast().getInt()); + *x = b.getI32IntegerAttr(mlir::cast(elements[1]).getInt()); + *y = b.getI32IntegerAttr(mlir::cast(elements[2]).getInt()); + *z = b.getI32IntegerAttr(mlir::cast(elements[3]).getInt()); return true; } @@ -109,10 +110,10 @@ bool TFIntListIs1XYZ1(Operation *op, StringRef name, IntegerAttr *x, // Returns true if every element of the attribute is 1. All elements of `attr` // must be `IntegerAttr`. bool TFIntListIsAllOnes(const Attribute attr) { - const auto &elements = attr.cast().getValue(); + const auto &elements = mlir::cast(attr).getValue(); return !std::any_of(elements.begin(), elements.end(), [](Attribute e) { - return e.cast().getValue() != 1; + return mlir::cast(e).getValue() != 1; }); } @@ -133,7 +134,7 @@ bool IsDimensionsDegenerateExceptLastOne(ArrayRef elements_shape) { } bool IsDimensionsDegenerateExceptLastOne(TypedAttr val) { - if (auto ranked_type = val.getType().dyn_cast()) { + if (auto ranked_type = mlir::dyn_cast(val.getType())) { return IsDimensionsDegenerateExceptLastOne(ranked_type.getShape()); } return false; diff --git a/tensorflow/compiler/mlir/lite/utils/validators.h b/tensorflow/compiler/mlir/lite/utils/validators.h index 08d2e7b068b4be..0e7370c5fa499b 100644 --- a/tensorflow/compiler/mlir/lite/utils/validators.h +++ b/tensorflow/compiler/mlir/lite/utils/validators.h @@ -21,6 +21,7 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { namespace TFL { @@ -70,21 +71,21 @@ bool TFIntListIsAllOnes(Attribute attr); // Returns true iff the given value is a float32 tensor. // is "DT_FLOAT". inline bool TFTypeIsFloat32Tensor(Value value) { - auto tensorType = value.getType().dyn_cast(); + auto tensorType = mlir::dyn_cast(value.getType()); if (!tensorType) return false; return tensorType.getElementType().isF32(); } // Returns true iff the given value is a bf16 tensor. inline bool TFTypeIsBFloat16Tensor(Value value) { - auto tensorType = value.getType().dyn_cast(); + auto tensorType = mlir::dyn_cast(value.getType()); if (!tensorType) return false; return tensorType.getElementType().isBF16(); } // Returns true iff the given value is a f16 tensor. 
inline bool TFTypeIsHalfTensor(Value value) { - auto tensorType = value.getType().dyn_cast(); + auto tensorType = mlir::dyn_cast(value.getType()); if (!tensorType) return false; return tensorType.getElementType().isF16(); } diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc index f5912553f10dbe..8f3261f6574ff7 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/utils/name_utils.h" static inline absl::string_view StringRefToView(llvm::StringRef ref) { @@ -123,7 +124,7 @@ std::string OpOrArgLocNameMapper::GetName(OpOrVal op_or_val) { // If the location is none of the expected types, then simply use name // generated using the op type. Follow TF convention and append the result // index unless 0. - if (auto result = val.dyn_cast()) { + if (auto result = mlir::dyn_cast(val)) { if (result.getResultNumber() > 0) return llvm::formatv("{0}:{1}", result.getOwner()->getName().getStringRef(), @@ -131,7 +132,7 @@ std::string OpOrArgLocNameMapper::GetName(OpOrVal op_or_val) { return std::string(result.getOwner()->getName().getStringRef()); } // Use the ASM syntax for BlockArgument - if (auto arg = val.dyn_cast()) { + if (auto arg = mlir::dyn_cast(val)) { return "arg" + std::to_string(arg.getArgNumber()); } return ""; diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 4841c0ad85714f..de1226c68c39d0 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -66,7 +66,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tosa/tfl_passes.h" #include "tensorflow/compiler/mlir/tosa/transforms/passes.h" #include "xla/mlir/framework/transforms/passes.h" -#include "xla/mlir_hlo/lhlo/transforms/passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" #include "xla/status_macros.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -92,7 +91,6 @@ static void RegisterPasses() { mlir::registerTensorFlowPasses(); mlir::TFDevice::registerTensorFlowDevicePasses(); mlir::mhlo::registerAllMhloPasses(); - mlir::lmhlo::registerAllLmhloPasses(); // These are in compiler/mlir/xla and not part of the above MHLO // passes. 
mlir::mhlo::registerTfXlaPasses(); diff --git a/tensorflow/compiler/mlir/quantization/common/BUILD b/tensorflow/compiler/mlir/quantization/common/BUILD index da122b67993af7..448b34717282b7 100644 --- a/tensorflow/compiler/mlir/quantization/common/BUILD +++ b/tensorflow/compiler/mlir/quantization/common/BUILD @@ -40,16 +40,18 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", "//tensorflow/core:framework_lite", "//tensorflow/core/ir/types:Dialect", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:nullability", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", + "@llvm-project//mlir:BytecodeOpInterface", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", - "@local_tsl//tsl/platform:protobuf", ], ) diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc index 540eff26685968..1367e7e5eaa175 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc @@ -42,14 +42,14 @@ namespace mlir::quant { using ::mlir::stablehlo::DotGeneralOp; bool HasStaticShape(Value value) { - auto shaped_type = value.getType().dyn_cast(); + auto shaped_type = mlir::dyn_cast(value.getType()); if (!shaped_type) return false; return shaped_type.hasStaticShape(); } bool HasStaticShapeAtDims(Value value, const ArrayRef dims) { - auto shaped_type = value.getType().dyn_cast(); + auto shaped_type = mlir::dyn_cast(value.getType()); if (!shaped_type || !shaped_type.hasRank()) return false; for (auto dim : dims) { @@ -59,9 +59,9 @@ bool HasStaticShapeAtDims(Value value, const ArrayRef dims) { } Type CloneTypeWithNewElementType(Type old_type, Type element_type) { - if (!old_type.isa()) return {}; + if (!mlir::isa(old_type)) return {}; - return old_type.cast().clone(element_type); + return mlir::cast(old_type).clone(element_type); } SmallVector CloneOpWithReplacedOperands( @@ -133,9 +133,11 @@ absl::StatusOr IsDotGeneralFullyConnected(DotGeneralOp dot_general_op) { const ArrayRef rhs_contracting_dims = dot_dimension_numbers.getRhsContractingDimensions(); const int64_t input_rank = - dot_general_op.getOperand(0).getType().dyn_cast().getRank(); + mlir::dyn_cast(dot_general_op.getOperand(0).getType()) + .getRank(); const int64_t filter_rank = - dot_general_op.getOperand(1).getType().dyn_cast().getRank(); + mlir::dyn_cast(dot_general_op.getOperand(1).getType()) + .getRank(); // The following conditions are such requirements: // - rank(lhs) is 1 or 2 // - rank(rhs) = 2 @@ -164,7 +166,8 @@ std::optional GetDotGeneralQuantizationDim( DotGeneralOp dot_general_op) { if (dot_general_op == nullptr) return std::nullopt; const int64_t filter_rank = - dot_general_op.getOperand(1).getType().dyn_cast().getRank(); + mlir::dyn_cast(dot_general_op.getOperand(1).getType()) + .getRank(); // To quantize rhs per-channel, we currently only consider the case where // `stablehlo.dot_general` is legalizable to `tfl.fully_connected`. 
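// Aside: a sketch of the fully-connected conditions listed above for
// stablehlo.dot_general, stated independently of MLIR (assumes no batch
// dimensions; names are illustrative, not the pass's own helper):
#include <cstdint>
#include <vector>

static bool IsFullyConnectedLike(int64_t lhs_rank, int64_t rhs_rank,
                                 const std::vector<int64_t>& lhs_contracting,
                                 const std::vector<int64_t>& rhs_contracting) {
  // rank(lhs) is 1 or 2, rank(rhs) == 2, and the contraction pairs lhs's
  // last dimension with rhs's dimension 0; rhs dim 1 is then the
  // per-channel (output feature) dimension.
  return (lhs_rank == 1 || lhs_rank == 2) && rhs_rank == 2 &&
         lhs_contracting == std::vector<int64_t>{lhs_rank - 1} &&
         rhs_contracting == std::vector<int64_t>{0};
}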
@@ -174,4 +177,8 @@ std::optional GetDotGeneralQuantizationDim( return filter_rank - 1; } +bool ContainsConvOrDot(StringRef str) { + return str.contains("_conv") || str.contains("_dot_general"); +} + } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h index 490a77a3b73ffa..e94f9359d6fad2 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h @@ -42,6 +42,10 @@ namespace mlir::quant { constexpr char kAttrMapAttribute[] = "attr_map"; +// Name of the string attribute attached to `XlaCallModuleOp`, which is the +// textproto representation of `Method`. +inline constexpr StringRef kQuantizationMethodAttr = "_quantization_method"; + // Permutation from the NHWC tensor format to NCHW. This is an inverse // permutation of `kNchwToNhwcPermutation`. inline constexpr std::array kNhwcToNchwPermutation = {0, 3, 1, 2}; @@ -65,7 +69,7 @@ bool HasStaticShapeAtDims(Value value, ArrayRef dims); // Whether `value` has known rank of `rank`. Returns false when it is not a // `ShapedType` or its rank is unknown. inline bool HasRankOf(Value value, const int64_t rank) { - auto shaped_type = value.getType().dyn_cast_or_null(); + auto shaped_type = mlir::dyn_cast_or_null(value.getType()); return shaped_type && shaped_type.hasRank() && shaped_type.getRank() == rank; } @@ -215,7 +219,7 @@ Operation* FindOperandOfType(Operation* op) { // Returns the function attribute for the given call op which is lifted for // quantization. inline FlatSymbolRefAttr GetFuncAttr(TF::PartitionedCallOp call_op) { - return call_op.getFAttr().template dyn_cast(); + return mlir::dyn_cast(call_op.getFAttr()); } inline FlatSymbolRefAttr GetFuncAttr(TF::XlaCallModuleOp call_op) { @@ -248,6 +252,9 @@ absl::StatusOr IsDotGeneralFullyConnected( std::optional GetDotGeneralQuantizationDim( ::mlir::stablehlo::DotGeneralOp dot_general_op); +// Checks if a `StringRef` contains 'conv' or 'dot_general'. 
+bool ContainsConvOrDot(StringRef str); + } // namespace mlir::quant #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc index ca0df77f81b51c..720616309afe38 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc @@ -98,6 +98,21 @@ constexpr absl::string_view kModuleXlaCallModule = R"mlir( } )mlir"; +constexpr absl::string_view kModuleDotWeightOnlyPtq = R"mlir( + module { + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", _quantization_method = "weight_only_ptq { }"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %2 : tensor + } + func.func private @composite_dot_general_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x2xf32>) -> tensor + return %0 : tensor + } + } +)mlir"; + constexpr absl::string_view kModuleXlaCallModuleNoEntryNoQuantTrait = R"mlir( module { func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { @@ -128,8 +143,8 @@ constexpr absl::string_view kModulePartitionedCall = R"mlir( constexpr absl::string_view kModuleHybridQuantized = R"mlir( module { - func.func @main(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3x!quant.uniform> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32>) { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> + func.func @main(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3x!quant.uniform> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32>) { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32> return %0 : tensor<1x3xf32> } } @@ -526,5 +541,31 @@ TEST_F(AttrsAndConstraintsTest, DotGeneralBatchMatmulReturnsNullQuantDim) { EXPECT_THAT(GetDotGeneralQuantizationDim(dot_general_op), Eq(std::nullopt)); } +TEST_F(AttrsAndConstraintsTest, ContainsConvOrDotTrue) { + OwningOpRef module_op = + ParseModuleOpString(kModuleDotWeightOnlyPtq); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = *main_fn.getOps().begin(); + const StringRef function_name = GetEntryFunctionName(call_op); + EXPECT_TRUE(ContainsConvOrDot(function_name)); +} + +TEST_F(AttrsAndConstraintsTest, ContainsConvOrDotFalse) { + OwningOpRef module_op = + ParseModuleOpString(kModuleXlaCallModuleNoEntryNoQuantTrait); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = *main_fn.getOps().begin(); + const StringRef function_name = 
GetEntryFunctionName(call_op); + EXPECT_FALSE(ContainsConvOrDot(function_name)); +} + } // namespace } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/ir/QuantOps.cc b/tensorflow/compiler/mlir/quantization/common/ir/QuantOps.cc index 7bd7424e4d1c6a..6ddebac1ff00f9 100644 --- a/tensorflow/compiler/mlir/quantization/common/ir/QuantOps.cc +++ b/tensorflow/compiler/mlir/quantization/common/ir/QuantOps.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/ir/QuantOpsDialect.cc.inc" namespace mlir::quant::ir { @@ -49,20 +50,20 @@ OpFoldResult StorageCastOp::fold(FoldAdaptor) { /// The quantization specification should match the expressed type. static bool isValidQuantizationSpec(Attribute quantSpec, Type expressed) { - if (auto typeAttr = quantSpec.dyn_cast()) { + if (auto typeAttr = mlir::dyn_cast(quantSpec)) { Type spec = typeAttr.getValue(); - if (spec.isa()) return false; + if (mlir::isa(spec)) return false; // The spec should be either a quantized type which is compatible to the // expressed type, or a primitive type which is as same as the // (element type of) the expressed type. - if (auto quantizedType = spec.dyn_cast()) + if (auto quantizedType = mlir::dyn_cast(spec)) return quantizedType.isCompatibleExpressedType(expressed); - if (auto tensorType = expressed.dyn_cast()) + if (auto tensorType = mlir::dyn_cast(expressed)) return spec == tensorType.getElementType(); - if (auto vectorType = expressed.dyn_cast()) + if (auto vectorType = mlir::dyn_cast(expressed)) return spec == vectorType.getElementType(); } return false; @@ -97,13 +98,13 @@ LogicalResult QuantizeRegionOp::verify() { } LogicalResult StatisticsOp::verify() { - auto tensorArg = getArg().getType().dyn_cast(); + auto tensorArg = mlir::dyn_cast(getArg().getType()); if (!tensorArg) return emitOpError("arg needs to be tensor type."); // Verify layerStats attribute. 
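// The block below requires layerStats to be a rank-1, two-element
// floating-point tensor holding a [min, max] pair for the whole layer;
// axisStats, verified further down, is the per-axis analogue: a rank-2
// float tensor whose trailing dimension is 2 (one [min, max] pair per entry).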
{ auto layerStatsType = getLayerStats().getShapedType(); - if (!layerStatsType.getElementType().isa()) { + if (!mlir::isa(layerStatsType.getElementType())) { return emitOpError("layerStats must have a floating point element type"); } if (layerStatsType.getRank() != 1 || layerStatsType.getDimSize(0) != 2) { @@ -120,7 +121,7 @@ LogicalResult StatisticsOp::verify() { std::multiplies()); auto axisStatsType = getAxisStats()->getShapedType(); - if (!axisStatsType.getElementType().isa()) { + if (!mlir::isa(axisStatsType.getElementType())) { return emitOpError("axisStats must have a floating point element type"); } if (axisStatsType.getRank() != 2 || axisStatsType.getDimSize(1) != 2 || diff --git a/tensorflow/compiler/mlir/quantization/common/ir/QuantOpsBase.td b/tensorflow/compiler/mlir/quantization/common/ir/QuantOpsBase.td index d891ed17ee1443..fb762f933d6f00 100644 --- a/tensorflow/compiler/mlir/quantization/common/ir/QuantOpsBase.td +++ b/tensorflow/compiler/mlir/quantization/common/ir/QuantOpsBase.td @@ -27,7 +27,6 @@ include "mlir/IR/OpBase.td" def Quant_Dialect : Dialect { let name = "quantization"; let cppNamespace = "::mlir::quant::ir"; - let usePropertiesForAttributes = 0; } #endif // QUANTIZATION_BASE \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc index 5a200241af00dd..c0509bb8243bfc 100644 --- a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc +++ b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc @@ -15,54 +15,66 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h" +#include +#include +#include +#include +#include #include +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project -using namespace mlir; -using namespace mlir::quantfork; +namespace mlir::quantfork { -static bool isQuantizablePrimitiveType(Type inputType) { - return inputType.isa(); +static bool isQuantizablePrimitiveType(Type input_type) { + return isa(input_type); } ExpressedToQuantizedConverter ExpressedToQuantizedConverter::forInputType( - Type inputType) { - if (inputType.isa()) { - Type elementType = inputType.cast().getElementType(); - if (!isQuantizablePrimitiveType(elementType)) - return ExpressedToQuantizedConverter{inputType, nullptr}; - return ExpressedToQuantizedConverter{inputType, elementType}; + Type input_type) { + if (isa(input_type)) { + Type element_type = cast(input_type).getElementType(); + if (!isQuantizablePrimitiveType(element_type)) + return ExpressedToQuantizedConverter{input_type, nullptr}; + return ExpressedToQuantizedConverter{input_type, element_type}; } // Supported primitive type (which just is the expressed type). - if (isQuantizablePrimitiveType(inputType)) - return ExpressedToQuantizedConverter{inputType, inputType}; + if (isQuantizablePrimitiveType(input_type)) + return ExpressedToQuantizedConverter{input_type, input_type}; // Unsupported. 
- return ExpressedToQuantizedConverter{inputType, nullptr}; + return ExpressedToQuantizedConverter{input_type, nullptr}; } Type ExpressedToQuantizedConverter::convert( - quant::QuantizedType elementalType) const { - assert(expressedType && "convert() on unsupported conversion"); - if (auto tensorType = inputType.dyn_cast()) - return RankedTensorType::get(tensorType.getShape(), elementalType); - if (auto tensorType = inputType.dyn_cast()) - return UnrankedTensorType::get(elementalType); - if (auto vectorType = inputType.dyn_cast()) - return VectorType::get(vectorType.getShape(), elementalType); + quant::QuantizedType elemental_type) const { + assert(expressed_type && "convert() on unsupported conversion"); + if (auto tensor_type = dyn_cast(input_type)) + return RankedTensorType::get(tensor_type.getShape(), elemental_type); + if (auto tensor_type = dyn_cast(input_type)) + return UnrankedTensorType::get(elemental_type); + if (auto vector_type = dyn_cast(input_type)) + return VectorType::get(vector_type.getShape(), elemental_type); // If the expressed types match, just use the new elemental type. - if (elementalType.getExpressedType() == expressedType) return elementalType; + if (elemental_type.getExpressedType() == expressed_type) { + return elemental_type; + } // Unsupported. return nullptr; } ElementsAttr UniformQuantizedPerAxisValueConverter::convert( - Attribute realValue) { - if (auto attr = realValue.dyn_cast()) { + Attribute real_value) { + if (auto attr = dyn_cast(real_value)) { return convert(attr); } - // TODO: handles sparse elements attribute return nullptr; } @@ -71,26 +83,30 @@ DenseElementsAttr UniformQuantizedPerAxisValueConverter::convert( // Creates the converter for each chunk. Normally the size of the // quantization dim is 3, so we can cache all the converters. ShapedType type = attr.getType(); - size_t dimSize = type.getDimSize(quantizationDim); - if (dimSize != scales.size()) { + std::size_t dim_size = type.getDimSize(quantization_dim_); + if (dim_size != scales_.size()) { return {}; } SmallVector converters; - converters.reserve(dimSize); - for (int i = 0, e = dimSize; i != e; ++i) { + converters.reserve(dim_size); + for (int i = 0, e = dim_size; i != e; ++i) { converters.push_back(getPerChunkConverter(i)); } // Scan the elements of the dense elements attributes and quantize them by // using the right quantization parameters. 
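// Row-major flattening means each run of chunk_size consecutive elements
// shares one index along the quantized dimension, so the channel for flat
// index i is (i / chunk_size) % dim_size. Illustration with an assumed shape
// [2, 3, 4] quantized along dim 1: chunk_size = 4, dim_size = 3, and flat
// index 17 (coordinates (1, 1, 1)) maps to channel (17 / 4) % 3 = 1.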
- int64_t flattenIndex = 0; + int64_t flatten_index = 0; auto shape = type.getShape(); - int64_t chunkSize = - std::accumulate(std::next(shape.begin(), quantizationDim + 1), + int64_t chunk_size = + std::accumulate(std::next(shape.begin(), quantization_dim_ + 1), shape.end(), 1, std::multiplies()); - Type newElementType = IntegerType::get(attr.getContext(), storageBitWidth); - return attr.mapValues(newElementType, [&](const APFloat &old) { - int chunkIndex = (flattenIndex++) / chunkSize; - return converters[chunkIndex % dimSize].quantizeFloatToInt(old); + Type new_element_type = + IntegerType::get(attr.getContext(), storage_bit_width_); + return attr.mapValues(new_element_type, [&](const APFloat &old) { + int chunk_index = flatten_index / chunk_size; + flatten_index++; + return converters[chunk_index % dim_size].quantizeFloatToInt(old); }); } + +} // namespace mlir::quantfork diff --git a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h index b6f65e455d0c09..c0c6c30e0d6e58 100644 --- a/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h +++ b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h @@ -16,130 +16,139 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_ +#include +#include +#include +#include +#include #include #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/APSInt.h" #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project - -namespace mlir { -namespace quantfork { - -/// Performs type conversion from an arbitrary input type to a type -/// that is expressed by a QuantizedType. -/// -/// This handles cases where the inputType is a supported primitive type -/// (i.e. f32, bf16, etc) or a vector/tensor type based on a supported -/// elemental type. -/// -/// Since conversion often involves introspecting some attributes of the -/// input type in order to determine how to represent it, this is a two step -/// process. +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir::quantfork { + +// Performs type conversion from an arbitrary input type to a type +// that is expressed by a QuantizedType. +// +// This handles cases where the inputType is a supported primitive type +// (i.e. f32, bf16, etc) or a vector/tensor type based on a supported +// elemental type. +// +// Since conversion often involves introspecting some attributes of the +// input type in order to determine how to represent it, this is a two step +// process. struct ExpressedToQuantizedConverter { - /// Creates a converter for the given input type. - static ExpressedToQuantizedConverter forInputType(Type inputType); + // Creates a converter for the given input type. + static ExpressedToQuantizedConverter forInputType(Type input_type); - /// Converts the inputType to be based on the given elemental type, - /// returning the new type (or nullptr and emit an error on failure). 
- Type convert(quant::QuantizedType elementalType) const; + // Converts the inputType to be based on the given elemental type, + // returning the new type (or nullptr and emit an error on failure). + Type convert(quant::QuantizedType elemental_type) const; - /// Whether the conversion is legal. - explicit operator bool() const { return (bool)expressedType; } + // Whether the conversion is legal. + explicit operator bool() const { return (bool)expressed_type; } - /// The input type that is being converted from. - /// This may be an elemental or composite type. - const Type inputType; + // The input type that is being converted from. + // This may be an elemental or composite type. + const Type input_type; - /// Supported, elemental expressed type (i.e. f32). - /// Will be nullptr if conversion is not supported. - const Type expressedType; + // Supported, elemental expressed type (i.e. f32). + // Will be nullptr if conversion is not supported. + const Type expressed_type; }; -/// Reference implementation of converting between real numbers and values -/// represented by a UniformQuantizedType. -/// Note that this is not expected to be speedy and may be superseded eventually -/// by a more optimal implementation. -/// Also, the interface assumes that quantization is done per-layer and will -/// need to be wider for various per-channel schemes. As such, this is a -/// placeholder. +// Reference implementation of converting between real numbers and values +// represented by a UniformQuantizedType. +// Note that this is not expected to be speedy and may be superseded eventually +// by a more optimal implementation. +// Also, the interface assumes that quantization is done per-layer and will +// need to be wider for various per-channel schemes. As such, this is a +// placeholder. 
class UniformQuantizedValueConverter { public: explicit UniformQuantizedValueConverter( - quant::UniformQuantizedType uniformType) + quant::UniformQuantizedType uniform_type) : UniformQuantizedValueConverter( - uniformType.getScale(), - static_cast(uniformType.getZeroPoint()), - static_cast(uniformType.getStorageTypeMin()), - static_cast(uniformType.getStorageTypeMax()), - uniformType.getStorageTypeIntegralWidth(), uniformType.isSigned()) { - assert(uniformType.getExpressedType().isa()); - assert(uniformType.getStorageType().isSignlessInteger()); + uniform_type.getScale(), + static_cast(uniform_type.getZeroPoint()), + static_cast(uniform_type.getStorageTypeMin()), + static_cast(uniform_type.getStorageTypeMax()), + uniform_type.getStorageTypeIntegralWidth(), + uniform_type.isSigned()) { + assert(isa(uniform_type.getExpressedType())); + assert(uniform_type.getStorageType().isSignlessInteger()); } - UniformQuantizedValueConverter(double scale, double zeroPoint, - double clampMin, double clampMax, - uint32_t storageBitWidth, bool isSigned) - : scale(scale), - zeroPoint(zeroPoint), - clampMin(clampMin), - clampMax(clampMax), - scaleDouble(scale), - zeroPointDouble(zeroPoint), - clampMinDouble(clampMin), - clampMaxDouble(clampMax), - storageBitWidth(storageBitWidth), - isSigned(isSigned), - roundMode(APFloat::rmNearestTiesToAway) {} - - UniformQuantizedValueConverter(double scale, double zeroPoint, - const APFloat &clampMin, - const APFloat &clampMax, - uint32_t storageBitWidth, bool isSigned) - : scale(scale), - zeroPoint(zeroPoint), - clampMin(clampMin), - clampMax(clampMax), - scaleDouble(scale), - zeroPointDouble(zeroPoint), - clampMinDouble(clampMin.convertToDouble()), - clampMaxDouble(clampMax.convertToDouble()), - storageBitWidth(storageBitWidth), - isSigned(isSigned), - roundMode(APFloat::rmNearestTiesToAway) {} - - virtual APInt quantizeFloatToInt(APFloat expressedValue) const { + UniformQuantizedValueConverter(double scale, double zero_point, + double clamp_min, double clamp_max, + uint32_t storage_bit_width, bool is_signed) + : scale_(scale), + zero_point_(zero_point), + clamp_min_(clamp_min), + clamp_max_(clamp_max), + scale_double_(scale), + zero_point_double_(zero_point), + clamp_min_double_(clamp_min), + clamp_max_double_(clamp_max), + storage_bit_width_(storage_bit_width), + is_signed_(is_signed), + round_mode_(APFloat::rmNearestTiesToAway) {} + + UniformQuantizedValueConverter(double scale, double zero_point, + const APFloat& clamp_min, + const APFloat& clamp_max, + uint32_t storage_bit_width, bool is_signed) + : scale_(scale), + zero_point_(zero_point), + clamp_min_(clamp_min), + clamp_max_(clamp_max), + scale_double_(scale), + zero_point_double_(zero_point), + clamp_min_double_(clamp_min.convertToDouble()), + clamp_max_double_(clamp_max.convertToDouble()), + storage_bit_width_(storage_bit_width), + is_signed_(is_signed), + round_mode_(APFloat::rmNearestTiesToAway) {} + + virtual APInt quantizeFloatToInt(APFloat expressed_value) const { // This function is a performance critical code path in quantization // since it runs for each single float parameter value. // Specialize f32->u8/i8 case to optimize performance. 
- if (&expressedValue.getSemantics() == &APFloat::IEEEsingle() && - storageBitWidth == 8 && - roundMode == llvm::APFloatBase::rmNearestTiesToAway) { - return quantizeF32ToInt8(expressedValue); + if (&expressed_value.getSemantics() == &APFloat::IEEEsingle() && + storage_bit_width_ == 8 && + round_mode_ == llvm::APFloatBase::rmNearestTiesToAway) { + return quantizeF32ToInt8(expressed_value); } bool lossy; - expressedValue.convert(scale.getSemantics(), roundMode, &lossy); - // fixedpoint = clamp(clampMin, clampMax, ( - // roundHalfToEven(expressed / scale) + zeroPoint)) - APFloat scaled = (expressedValue / scale); - scaled.roundToIntegral(roundMode); - scaled.add(zeroPoint, roundMode); - APFloat fixedpoint = llvm::minimum(scaled, clampMax); - fixedpoint = llvm::maximum(fixedpoint, clampMin); - - llvm::APSInt result(storageBitWidth, !isSigned); - fixedpoint.convertToInteger(result, roundMode, &lossy); + expressed_value.convert(scale_.getSemantics(), round_mode_, &lossy); + // fixed_point = clamp(clamp_min, clamp_max, ( + // roundHalfToEven(expressed / scale) + zero_point)) + APFloat scaled = (expressed_value / scale_); + scaled.roundToIntegral(round_mode_); + scaled.add(zero_point_, round_mode_); + APFloat fixed_point = llvm::minimum(scaled, clamp_max_); + fixed_point = llvm::maximum(fixed_point, clamp_min_); + + llvm::APSInt result(storage_bit_width_, !is_signed_); + fixed_point.convertToInteger(result, round_mode_, &lossy); return std::move(result); } - int64_t quantizeFloatToInt64(APFloat expressedValue) const { - APInt qValue = quantizeFloatToInt(std::move(expressedValue)); - return isSigned ? qValue.getSExtValue() : qValue.getZExtValue(); + int64_t quantizeFloatToInt64(APFloat expressed_value) const { + const APInt q_value = quantizeFloatToInt(std::move(expressed_value)); + return is_signed_ ? q_value.getSExtValue() : q_value.getZExtValue(); } virtual ~UniformQuantizedValueConverter() = default; @@ -147,94 +156,92 @@ class UniformQuantizedValueConverter { private: // An optimized implementation to quantize f32 to i8/u8 with C++ native // arithmetic. - virtual APInt quantizeF32ToInt8(APFloat expressedValue) const { - assert(&expressedValue.getSemantics() == &APFloat::IEEEsingle()); - assert(storageBitWidth == 8); - assert(roundMode == llvm::APFloatBase::rmNearestTiesToAway); + virtual APInt quantizeF32ToInt8(const APFloat& expressed_value) const { + assert(&expressed_value.getSemantics() == &APFloat::IEEEsingle()); + assert(storage_bit_width_ == 8); + assert(round_mode_ == llvm::APFloatBase::rmNearestTiesToAway); - const float realValue = expressedValue.convertToFloat(); + const float real_value = expressed_value.convertToFloat(); - const double scaled = realValue / scaleDouble + zeroPointDouble; + const double scaled = real_value / scale_double_ + zero_point_double_; // Round to nearest integer with halfway cases rounded away from zero. 
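// Worked example with illustrative parameters, not taken from the change:
// with scale = 0.5 and zero_point = 10, an input of 3.2f quantizes to
// round(3.2 / 0.5) + 10 = 6 + 10 = 16, which the i8 clamp range of
// [-128, 127] leaves unchanged.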
- const double scaledRounded = std::round(scaled); - const double clamped = - std::min(std::max(scaledRounded, clampMinDouble), clampMaxDouble); - - uint64_t signlessResult; - if (isSigned) { - int64_t clampedInt = static_cast(clamped); - memcpy(&signlessResult, &clampedInt, sizeof(clampedInt)); + const double scaled_rounded = std::round(scaled); + const double clamped = std::min(std::max(scaled_rounded, clamp_min_double_), + clamp_max_double_); + + uint64_t signless_result; + if (is_signed_) { + int64_t clamped_int = static_cast(clamped); + memcpy(&signless_result, &clamped_int, sizeof(clamped_int)); } else { - signlessResult = static_cast(clamped); + signless_result = static_cast(clamped); } - return APInt(storageBitWidth, signlessResult); + return APInt(storage_bit_width_, signless_result); } // Keep both APFloat and double versions of the quantization parameters // around since they will be used in generic and specialized arithmetic, // respectively. - const APFloat scale; - const APFloat zeroPoint; - const APFloat clampMin; - const APFloat clampMax; - - const double scaleDouble; - const double zeroPointDouble; - const double clampMinDouble; - const double clampMaxDouble; - - const uint32_t storageBitWidth; - const bool isSigned; - const llvm::APFloat::roundingMode roundMode; + const APFloat scale_; + const APFloat zero_point_; + const APFloat clamp_min_; + const APFloat clamp_max_; + + const double scale_double_; + const double zero_point_double_; + const double clamp_min_double_; + const double clamp_max_double_; + + const uint32_t storage_bit_width_; + const bool is_signed_; + const llvm::APFloat::roundingMode round_mode_; }; -/// An utility class to quantize an attribute by the per-axis quantization -/// parameters. The size of the quantization dim in the converted elements -/// attribute should matche the size of of scales/zeroPoints vectors in the -/// quantization parameters. +// A utility class to quantize an attribute by the per-axis quantization +// parameters. The size of the quantization dim in the converted elements +// attribute should match the size of the scales/zero_points vectors in the +// quantization parameters. class UniformQuantizedPerAxisValueConverter { public: explicit UniformQuantizedPerAxisValueConverter( - quant::UniformQuantizedPerAxisType uniformType) - : scales(uniformType.getScales()), - zeroPoints(uniformType.getZeroPoints()), - clampMin(static_cast(uniformType.getStorageTypeMin())), - clampMax(static_cast(uniformType.getStorageTypeMax())), - storageBitWidth(uniformType.getStorageTypeIntegralWidth()), - isSigned(uniformType.isSigned()), - quantizationDim(uniformType.getQuantizedDimension()) { - assert(uniformType.getExpressedType().isa()); - assert(uniformType.getStorageType().isSignlessInteger()); - assert(scales.size() == zeroPoints.size()); + quant::UniformQuantizedPerAxisType uniform_type) + : scales_(uniform_type.getScales()), + zero_points_(uniform_type.getZeroPoints()), + clamp_min_(static_cast(uniform_type.getStorageTypeMin())), + clamp_max_(static_cast(uniform_type.getStorageTypeMax())), + storage_bit_width_(uniform_type.getStorageTypeIntegralWidth()), + is_signed_(uniform_type.isSigned()), + quantization_dim_(uniform_type.getQuantizedDimension()) { + assert(isa(uniform_type.getExpressedType())); + assert(uniform_type.getStorageType().isSignlessInteger()); + assert(scales_.size() == zero_points_.size()); } - /// Quantize an Attribute by the quantization parameters.
Return nullptr if - /// the conversion fails or the input array isn't an ElementsAttr. - ElementsAttr convert(Attribute realValue); + // Quantize an Attribute by the quantization parameters. Return nullptr if + // the conversion fails or the input array isn't an ElementsAttr. + ElementsAttr convert(Attribute real_value); private: - /// Quantize an DenseFPElementsAttr by the quantization parameters. + // Quantize a DenseFPElementsAttr by the quantization parameters. DenseElementsAttr convert(DenseFPElementsAttr attr); - /// Get a uniform converter for the index-th chunk along the quantizationDim. - /// All the elements in this chunk is quantized by the returned converter. + // Get a uniform converter for the index-th chunk along the quantization dim. + // All the elements in this chunk are quantized by the returned converter. UniformQuantizedValueConverter getPerChunkConverter(int index) const { - UniformQuantizedValueConverter converter(scales[index], zeroPoints[index], - clampMin, clampMax, - storageBitWidth, isSigned); - return converter; + return UniformQuantizedValueConverter(scales_[index], zero_points_[index], + clamp_min_, clamp_max_, + storage_bit_width_, is_signed_); } - const ArrayRef scales; - const ArrayRef zeroPoints; - const APFloat clampMin; - const APFloat clampMax; - const uint32_t storageBitWidth; - const bool isSigned; - int32_t quantizationDim; + const ArrayRef scales_; + const ArrayRef zero_points_; + const APFloat clamp_min_; + const APFloat clamp_max_; + const uint32_t storage_bit_width_; + const bool is_signed_; + int32_t quantization_dim_; }; -} // namespace quantfork -} // namespace mlir +} // namespace mlir::quantfork #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc index 050bf45d7b5a46..bf894948d4cec8 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc @@ -21,6 +21,8 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" +#include "absl/base/nullability.h" #include "absl/container/flat_hash_set.h" #include "absl/log/log.h" #include "absl/status/status.h" @@ -136,7 +138,7 @@ ValueRange CreateTFXlaCallModuleOp(OpBuilder& builder, const Location location, SmallVector shape_attrs; for (const Type result_type : output_types) { shape_attrs.push_back( - tf_type::ShapeAttr::get(ctx, result_type.cast())); + tf_type::ShapeAttr::get(ctx, mlir::cast(result_type))); } auto empty_array_attr = ArrayAttr::get(ctx, {}); auto platforms = ArrayAttr::get(ctx, {StringAttr::get(ctx, kPlatformCpu)}); @@ -266,9 +268,9 @@ LogicalResult SetAttributeMap(MLIRContext& context, const NamedAttribute& attribute = attributes[idx]; // Skip the following steps if the attribute value is `NullAttribute`.
if (const auto string_attr = - attribute.getValue().dyn_cast_or_null(); + mlir::dyn_cast_or_null(attribute.getValue()); string_attr != nullptr && - string_attr.getValue().equals(kNullAttributeValue)) { + string_attr.getValue() == kNullAttributeValue) { continue; } @@ -479,10 +481,9 @@ bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr) { rhs_out_idx_start >= batch_dim_size; } -absl::StatusOr GetQuantizationMethod( - TF::XlaCallModuleOp xla_call_module_op) { +absl::StatusOr GetQuantizationMethod(absl::Nonnull op) { const auto quantization_method_attr = - xla_call_module_op->getAttrOfType(kQuantizationMethodAttr); + op->getAttrOfType(kQuantizationMethodAttr); if (!quantization_method_attr) { return absl::InvalidArgumentError(absl::StrCat( "Attribute ", kQuantizationMethodAttr.str(), " is not found.")); @@ -498,15 +499,40 @@ absl::StatusOr GetQuantizationMethod( return quantization_method; } -Method GetQuantizationMethodOrDefault(TF::XlaCallModuleOp xla_call_module_op) { - absl::StatusOr method = GetQuantizationMethod(xla_call_module_op); +Method GetQuantizationMethodOrDefault(absl::Nonnull op) { + absl::StatusOr method = GetQuantizationMethod(op); if (method.status().code() == absl::StatusCode::kInternal) { // This indicates that the `Method` protobuf string is corrupt, but this // function ignores it and returns the default instance. - xla_call_module_op->emitError(absl::StrCat( - "Failed to get quantization method: ", method.status().ToString())); + op->emitError(absl::StrCat("Failed to get quantization method: ", + method.status().ToString())); } return method.ok() ? *method : Method::default_instance(); } +bool HasWeightOnlyPtqMethod(TF::XlaCallModuleOp xla_call_module_op) { + Method method = GetQuantizationMethodOrDefault(xla_call_module_op); + return method.has_weight_only_ptq(); +} + +bool IsWeightOnlyQuantizableOp(const Operation& op) { + if (auto call_op = dyn_cast(op)) { + StringRef entry_function_name = GetEntryFunctionName(call_op); + absl::StatusOr quantization_method = GetQuantizationMethod(call_op); + return ContainsConvOrDot(entry_function_name) && quantization_method.ok() && + quantization_method->has_weight_only_ptq(); + } + return false; +} + +SmallVector GetSortedFunctions(ModuleOp module_op) { + auto iterator_range = module_op.getOps(); + SmallVector func_ops(iterator_range.begin(), + iterator_range.end()); + absl::c_sort(func_ops, [](func::FuncOp op1, func::FuncOp op2) { + return op1.getName() < op2.getName(); + }); + return func_ops; +} + } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h index bfef9a13df1a01..2d22816e725a48 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h @@ -15,11 +15,15 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ +#include "absl/base/nullability.h" #include "absl/status/statusor.h" #include "llvm/ADT/SmallVector.h" +#include "mlir/Bytecode/BytecodeOpInterface.h" // from @llvm-project +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -43,10 +47,6 @@ constexpr StringRef kCompositeFuncPrefix = "composite_"; inline constexpr StringRef kOriginalStablehloEntryFunctionAttrName = "_original_entry_function"; -// Name of the string attribute attached to `XlaCallModuleOp`, which is the -// textproto representation of `Method`. -inline constexpr StringRef kQuantizationMethodAttr = "_quantization_method"; - // FunctionCallOpType to be generated as the function call operator when // function lifting will happen. enum FunctionCallOpType { TFPartitionedCallOp = 0, TFXlaCallModuleOp = 1 }; @@ -62,19 +62,20 @@ bool IsInStableHloOpRegion(Operation* op); // Checks if a given einsum op is supported for XlaDotV2 quantization. bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr); -// Gets the quantization method from the given `XlaCallModuleOp`. It is -// retrieved from the `kQuantizationMethodAttr` string attribute. Returns +// Gets the quantization method from `op`. It is retrieved from the +// `kQuantizationMethodAttr` string attribute. Returns // `absl::InvalidArgumentError` when the attribute doesn't exist. Returns // `absl::InternalError` when parsing the attribute to `Method` failed. +// `op` must be non-null. absl::StatusOr<::stablehlo::quantization::Method> GetQuantizationMethod( - TF::XlaCallModuleOp xla_call_module_op); + absl::Nonnull op); -// Gets the quantization method from the given `XlaCallModuleOp`. It is -// retrieved from the `kQuantizationMethodAttr` string attribute. Returns a -// default instance of `Method` iff the attribute doesn't exist or the attribute -// contains an invalid textproto for `Method`. +// Gets the quantization method from `op`. It is retrieved from the +// `kQuantizationMethodAttr` string attribute. Returns a default instance of +// `Method` iff the attribute doesn't exist or the attribute contains an invalid +// textproto for `Method`. `op` must be non-null. ::stablehlo::quantization::Method GetQuantizationMethodOrDefault( - TF::XlaCallModuleOp xla_call_module_op); + absl::Nonnull op); // Creates a function to wrap the section between arguments and results. // The generated function call op type will be decided by the given call_op_type @@ -99,6 +100,17 @@ SmallVector LiftAsFunctionCall(OpBuilder& builder, Location location, // Used to attach bias to einsum argument list. SmallVector AppendToVector(ArrayRef arguments, Value append); +// Checks if the `Method` attached to the given `tf.XlaCallModule` op has +// `WeightOnlyPtq`. +bool HasWeightOnlyPtqMethod(TF::XlaCallModuleOp xla_call_module_op); + +// Checks if an op is a `tf.XlaCallModule` op, contains 'conv' or 'dot_general' +// in its name, and has `Method` with `WeightOnlyPtq`.
+bool IsWeightOnlyQuantizableOp(const Operation& op); + +// Lists the functions in a ModuleOp sorted by their names. +SmallVector GetSortedFunctions(ModuleOp module_op); + } // namespace mlir::quant #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_LIFT_AS_FUNCTION_CALL_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc index 5e5e103ba72018..4a40a70700f9b2 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc @@ -47,6 +47,8 @@ namespace { using ::stablehlo::quantization::Method; using ::testing::HasSubstr; using ::testing::NotNull; +using ::testing::SizeIs; +using ::testing::StrEq; using ::tsl::protobuf::util::MessageDifferencer; using ::tsl::testing::IsOk; using ::tsl::testing::StatusIs; @@ -118,10 +120,11 @@ TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) { FindOperationOfType(entry_func); EXPECT_TRUE(isa(lifted_op)); - EXPECT_EQ(lifted_op->getAttr("_original_entry_function").cast(), - "composite_dot_general_fn_1"); EXPECT_EQ( - lifted_dot_general_op->getAttr("precision_config").cast(), + mlir::cast(lifted_op->getAttr("_original_entry_function")), + "composite_dot_general_fn_1"); + EXPECT_EQ( + mlir::cast(lifted_dot_general_op->getAttr("precision_config")), builder_.getArrayAttr(SmallVector( 1, mlir::stablehlo::PrecisionAttr::get( ctx_.get(), mlir::stablehlo::Precision::DEFAULT)))); @@ -144,8 +147,9 @@ TEST_F(LiftAsFunctionCallTest, FunctionNoAttrLiftedAsXlaCallModuleOp) { "composite_dot_general_fn", operands, results)[0] .getDefiningOp(); EXPECT_TRUE(isa(lifted_op)); - EXPECT_EQ(lifted_op->getAttr("_original_entry_function").cast(), - "composite_dot_general_fn_1"); + EXPECT_EQ( + mlir::cast(lifted_op->getAttr("_original_entry_function")), + "composite_dot_general_fn_1"); } TEST_F(LiftAsFunctionCallTest, EinsumSupportedForXlaDotV2Succeeds) { @@ -351,6 +355,179 @@ TEST_F( const Method method = GetQuantizationMethodOrDefault(*xla_call_module_op); EXPECT_TRUE(MessageDifferencer::Equals(method, Method::default_instance())); } +constexpr absl::string_view kModuleDotWeightOnlyPtq = R"mlir( + module { + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", _quantization_method = "weight_only_ptq { }"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %2 : tensor + } + func.func private @composite_dot_general_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x2xf32>) -> tensor + return %0 : tensor + } + } +)mlir"; + +TEST_F(LiftAsFunctionCallTest, HasWeightOnlyPtqMethodExists) { + OwningOpRef module_op = + ParseModuleOpString(kModuleDotWeightOnlyPtq); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = 
*main_fn.getOps().begin(); + EXPECT_TRUE(HasWeightOnlyPtqMethod(call_op)); +} + +TEST_F(LiftAsFunctionCallTest, HasWeightOnlyPtqMethodDifferentMethod) { + const absl::string_view kModuleDotNoQuantization = R"mlir( + module { + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", _quantization_method = "no_quantization { }"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %2 : tensor + } + func.func private @composite_dot_general_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x2xf32>) -> tensor + return %0 : tensor + } + } + )mlir"; + OwningOpRef module_op = + ParseModuleOpString(kModuleDotNoQuantization); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = *main_fn.getOps().begin(); + EXPECT_FALSE(HasWeightOnlyPtqMethod(call_op)); +} + +TEST_F(LiftAsFunctionCallTest, HasWeightOnlyPtqMethodNoMethod) { + const absl::string_view kModuleXlaCallModule = R"mlir( + module { + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_fn_1, _original_entry_function = "composite_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %2 : tensor + } + func.func private @composite_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + return %arg0 : tensor + } + } + )mlir"; + OwningOpRef module_op = ParseModuleOpString(kModuleXlaCallModule); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = *main_fn.getOps().begin(); + EXPECT_FALSE(HasWeightOnlyPtqMethod(call_op)); +} + +TEST_F(LiftAsFunctionCallTest, IsWeightOnlyQuantizableOpDot) { + OwningOpRef module_op = + ParseModuleOpString(kModuleDotWeightOnlyPtq); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = *main_fn.getOps().begin(); + EXPECT_TRUE(IsWeightOnlyQuantizableOp(*call_op)); +} + +TEST_F(LiftAsFunctionCallTest, IsWeightOnlyQuantizableOpNotTfXlaCallModuleOp) { + const absl::string_view kModulePartitionedCallDot = R"mlir( + module { + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.PartitionedCall"(%arg0, %1, %0) 
{_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_dot_general_fn_1, _quantization_method = "weight_only_ptq { }"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %2 : tensor + } + func.func private @composite_dot_general_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x2xf32>) -> tensor + return %0 : tensor + } + } + )mlir"; + OwningOpRef module_op = + ParseModuleOpString(kModulePartitionedCallDot); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = *main_fn.getOps().begin(); + EXPECT_FALSE(IsWeightOnlyQuantizableOp(*call_op)); +} + +TEST_F(LiftAsFunctionCallTest, IsWeightOnlyQuantizableOpNoConvNoDot) { + constexpr absl::string_view kModuleXlaCallModule = R"mlir( + module { + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_fn_1, _original_entry_function = "composite_fn_1", _tfl_quant_trait = "fully_quantizable", _quantization_method = "weight_only_ptq { }"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %2 : tensor + } + func.func private @composite_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + return %arg0 : tensor + } + } + )mlir"; + OwningOpRef module_op = ParseModuleOpString(kModuleXlaCallModule); + ASSERT_TRUE(module_op); + + func::FuncOp main_fn = FindMainFuncOp(*module_op); + ASSERT_THAT(main_fn, NotNull()); + + auto call_op = *main_fn.getOps().begin(); + EXPECT_FALSE(IsWeightOnlyQuantizableOp(*call_op)); +} + +TEST_F(LiftAsFunctionCallTest, GetSortedFunctions) { + constexpr absl::string_view kModuleXlaCallModule = R"mlir( + module { + func.func @conv_3_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } + + func.func @conv_1_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } + + func.func @conv_2_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } + } + )mlir"; + OwningOpRef module_op = ParseModuleOpString(kModuleXlaCallModule); + ASSERT_TRUE(module_op); + + SmallVector funcs = GetSortedFunctions(*module_op); + ASSERT_THAT(funcs, SizeIs(3)); + EXPECT_THAT(funcs[0].getSymName(), StrEq("conv_1_fn")); + EXPECT_THAT(funcs[1].getSymName(), StrEq("conv_2_fn")); + EXPECT_THAT(funcs[2].getSymName(), StrEq("conv_3_fn")); +} } // namespace } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc index 216a4a2b3d58e9..7645177160fc62 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc @@ -91,10 +91,11 @@ bool HasPerAxisQuantizedOperand(Operation* op) { for (int i = 0; i < op->getNumOperands(); ++i) { if (auto dq_op = dyn_cast_or_null( op->getOperand(i).getDefiningOp())) { - auto type = dq_op.getArg().getType().cast().getElementType(); + auto type = + mlir::cast(dq_op.getArg().getType()).getElementType(); if (auto per_axis_qtype = - QuantizedType::getQuantizedElementType(type) - .dyn_cast_or_null()) { + mlir::dyn_cast_or_null( + QuantizedType::getQuantizedElementType(type))) { return true; } } @@ -179,7 +180,7 @@ bool QuantizationDriver::SetConstantResultParams(Operation* op) { /*num_bits=*/8, is_signed_, /*narrow_range=*/is_weight, legacy_float_scale_); } - if (const auto quant_type = final_type.dyn_cast_or_null(); + if (const auto quant_type = mlir::dyn_cast_or_null(final_type); quant_type != nullptr) { return SetResultParams(op, /*result_index=*/0, quant_type); } @@ -225,7 +226,7 @@ QuantizedType QuantizationDriver::GetBiasParams( if (bias_op != nullptr) { Type bias_type = bias_op->getResult(0).getType(); if (bias_type != builder_.getNoneType()) { - const int bias_rank = bias_type.dyn_cast().getRank(); + const int bias_rank = mlir::dyn_cast(bias_type).getRank(); adjusted_quant_dim = bias_rank > 1 ? bias_rank - 1 : 0; } } @@ -489,12 +490,12 @@ QuantizedType QuantizationDriver::GetQuantParamsForSameScaleConstraint( void QuantizationDriver::PreprocessConstantOps() { fn_.walk([&](arith::ConstantOp cst) { // Non-float tensors are neither weights nor require quantization. - const auto type = cst.getType().dyn_cast(); - if (!type || !type.getElementType().isa()) return; + const auto type = mlir::dyn_cast(cst.getType()); + if (!type || !mlir::isa(type.getElementType())) return; // Skip if the value is NaN or INF. // Otherwise the illegal scale/zp will be calculated. 
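// The scale and zero point are derived from the constant's value range, so a
// non-finite entry would propagate: an inf value yields an infinite range and
// a NaN value defeats the min/max comparisons, producing unusable
// quantization parameters. Such constants are simply left unquantized.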
- auto float_attr = cst.getValueAttr().dyn_cast(); + auto float_attr = mlir::dyn_cast(cst.getValueAttr()); if (float_attr && (float_attr.getValues().empty() || !float_attr.getValues()[0].isFinite())) { return; @@ -620,7 +621,7 @@ bool QuantizationDriver::ShouldCheckBiasScale( auto affine_op = dyn_cast(op); auto bias_op = op->getOperand(bias_index).getDefiningOp(); if (!affine_op || !bias_op || input_indices.size() != 2) return false; - if (!bias_op.getValue().isa()) return false; + if (!mlir::isa(bias_op.getValue())) return false; filter_index = affine_op.GetAffineOperandIndex(); if (!op->getOperand(filter_index).getDefiningOp()) { return false; @@ -658,12 +659,12 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( QuantState filter_state = GetOperandQuantState(op, filter_index); auto bias_op = op->getOperand(bias_index).getDefiningOp(); const double input_scale = - input_state.params.cast().getScale(); + mlir::cast(input_state.params).getScale(); - auto bias_values = bias_op.getValue().cast(); + auto bias_values = mlir::cast(bias_op.getValue()); // Restrict maximum absolute value of bias within INT_MAX / 2, to make some // room for accumulator. - if (auto bias_quantized_type = params.dyn_cast(); + if (auto bias_quantized_type = mlir::dyn_cast(params); bias_quantized_type != nullptr) { double bias_half_range = 0.0f; for (auto bias : bias_values.getValues()) { @@ -691,7 +692,7 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( } const auto filter_quantized_type = - filter_state.params.cast(); + mlir::cast(filter_state.params); changed |= SetOperandParams( op, filter_index, UniformQuantizedType::getChecked( @@ -703,10 +704,10 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments( filter_quantized_type.getStorageTypeMax()), /*override=*/true); } else if (auto bias_quantized_type = - params.dyn_cast(); + mlir::dyn_cast(params); bias_quantized_type != nullptr) { const auto filter_quantized_type = - filter_state.params.cast(); + mlir::cast(filter_state.params); std::vector new_bias_scales = bias_quantized_type.getScales().vec(); std::vector new_filter_scales = filter_quantized_type.getScales().vec(); @@ -822,21 +823,22 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() { // Use the final state to set all the operands' parameters. for (int i = 0; i < op->getNumOperands(); ++i) { - if (auto type = op->getOperand(i).getType().dyn_cast()) { + if (auto type = + mlir::dyn_cast(op->getOperand(i).getType())) { // Without this check, it will accidentally propagate the quantization // information by the shared non-float tensors. - if (type.getElementType().isa()) + if (mlir::isa(type.getElementType())) changed |= SetOperandParams(op, i, params); } } // Use the final state to set all the results' parameters. for (int i = 0; i < op->getNumResults(); ++i) - if (auto type = op->getResult(i).getType().dyn_cast(); + if (auto type = mlir::dyn_cast(op->getResult(i).getType()); type != nullptr) { // Without this check, it will accidentally propagate the quantization // information by the shared non-float-tensors. 
- if (type.getElementType().isa()) + if (mlir::isa(type.getElementType())) changed |= SetResultParams(op, i, params); } } diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc index cc82c09894b46b..f017054cbe7044 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc @@ -159,10 +159,9 @@ TEST_F(ApplyQuantizationParamsPropagationTest, FinalizeInsertsQDQOps) { ASSERT_NE(filter_qcast_op, nullptr); EXPECT_TRUE(isa(filter_qcast_op)); EXPECT_TRUE(isa(filter_dcast_op)); - EXPECT_TRUE(isa(filter_qcast_op->getResult(0) - .getType() - .cast() - .getElementType())); + EXPECT_TRUE(isa( + mlir::cast(filter_qcast_op->getResult(0).getType()) + .getElementType())); } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc index f6c561be98d49b..8e5496106c5279 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc @@ -29,7 +29,6 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project @@ -125,14 +124,13 @@ QuantizedType ResetMinMaxFromNumBits(const QuantizedType type, const auto& recalculate_zero_point = [&](int64_t zero_point) -> int64_t { return qmax - std::round((storage_type_max - zero_point) / rate); }; - if (auto q_type = type.dyn_cast()) { + if (auto q_type = dyn_cast(type)) { const double scale = recalculate_scale(q_type.getScale()); const double zero_point = recalculate_zero_point(q_type.getZeroPoint()); return UniformQuantizedType::get(q_type.getFlags(), q_type.getStorageType(), q_type.getExpressedType(), scale, zero_point, qmin, qmax); - } else if (auto q_type = - type.dyn_cast()) { + } else if (auto q_type = dyn_cast(type)) { const int size = q_type.getScales().size(); SmallVector scales(size); SmallVector zero_points(size); @@ -155,7 +153,7 @@ quant::UniformQuantizedPerAxisType ResetAxisAndBroadcast( const ArrayRef shape, const quant::UniformQuantizedPerAxisType qtype, const Type target, const int quant_dim) { - const auto shaped = target.dyn_cast(); + const auto shaped = dyn_cast(target); if (!shaped) return {}; const ArrayRef new_shape = shaped.getShape(); @@ -236,52 +234,54 @@ Type GetQuantizedType(Builder builder, const Type input_type, SmallVector effective_mins, effective_maxs; ExpandVerySmallRange(min, max, effective_mins, effective_maxs); - quant::QuantizedType quantizedEleType; + quant::QuantizedType quantized_element_type; if (min.size() == 1 && max.size() == 1 && quant_dim == -1) { - quantizedEleType = quantfork::fakeQuantAttrsToType( + quantized_element_type = quantfork::fakeQuantAttrsToType( builder.getUnknownLoc(), storage_type_width, effective_mins[0], - effective_maxs[0], narrow_range, converter.expressedType, is_signed); + effective_maxs[0], narrow_range, converter.expressed_type, is_signed); if (legacy_float_scale) { - quantizedEleType = - DownCastScale(quantizedEleType, effective_mins[0], effective_maxs[0], - 
builder.getUnknownLoc()); + quantized_element_type = + DownCastScale(quantized_element_type, effective_mins[0], + effective_maxs[0], builder.getUnknownLoc()); } } else if (min.size() == max.size()) { - auto shape = input_type.dyn_cast(); + auto shape = dyn_cast(input_type); if (!shape || shape.getRank() <= quant_dim || static_cast(min.size()) != shape.getDimSize(quant_dim)) { return {}; } // The quantization dim is set to the last dimension. - quantizedEleType = quantfork::fakeQuantAttrsToType( + quantized_element_type = quantfork::fakeQuantAttrsToType( builder.getUnknownLoc(), storage_type_width, quant_dim, effective_mins, - effective_maxs, narrow_range, converter.expressedType, is_signed); + effective_maxs, narrow_range, converter.expressed_type, is_signed); if (legacy_float_scale) { - quantizedEleType = DownCastScale(quantizedEleType, effective_mins, - effective_maxs, builder.getUnknownLoc()); + quantized_element_type = + DownCastScale(quantized_element_type, effective_mins, effective_maxs, + builder.getUnknownLoc()); } } - if (!quantizedEleType) return {}; + if (!quantized_element_type) return {}; // Use fake quant configured bit-widths (only supported for // 1 < num_bits < 8 bits) instead of using 8-bit defaults. if (use_fake_quant_num_bits && storage_type_width > 1 && storage_type_width < 8 && - quantizedEleType.getStorageTypeMax() > + quantized_element_type.getStorageTypeMax() > QType::getDefaultMinimumForInteger(is_signed, storage_type_width)) { const auto resetEleType = ResetMinMaxFromNumBits( - quantizedEleType, storage_type_width, narrow_range, is_signed); + quantized_element_type, storage_type_width, narrow_range, is_signed); return converter.convert(resetEleType); } - return converter.convert(quantizedEleType); + return converter.convert(quantized_element_type); } // TODO(fengliuai): promote this utility method to mlir QuantOps. TypeAttr RescaleQuantizedType(const Type input, const Attribute factor) { - const auto factor_values = factor.dyn_cast_or_null(); + const auto factor_values = dyn_cast_or_null(factor); if (!factor_values) return {}; - const auto ele_type = quant::QuantizedType::getQuantizedElementType(input); - if (!ele_type) return {}; - if (auto qtype = ele_type.dyn_cast()) { + const auto element_type = + quant::QuantizedType::getQuantizedElementType(input); + if (!element_type) return {}; + if (auto qtype = dyn_cast(element_type)) { const ArrayRef scales = qtype.getScales(); // Broadcasting hasn't been implemented yet. 
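
// ResetMinMaxFromNumBits (above) remaps an already-quantized type onto a
// narrower storage range by a constant rate, keeping the represented real
// interval fixed: the scale grows by `rate` and the zero point moves so the
// top of the range still maps to the same real value. A hedged sketch of that
// arithmetic with illustrative names; the zero-point formula mirrors the
// lambda shown in the hunk, the scale formula is the conventional companion.
#include <cmath>
#include <cstdint>
#include <utility>

std::pair<double, int64_t> RescaleToNumBits(
    const double scale, const int64_t zero_point,
    const int64_t storage_min, const int64_t storage_max,  // current range
    const int64_t qmin, const int64_t qmax) {              // target range
  const double rate =
      static_cast<double>(storage_max - storage_min) / (qmax - qmin);
  const double new_scale = scale * rate;
  const int64_t new_zero_point =
      qmax -
      static_cast<int64_t>(std::round((storage_max - zero_point) / rate));
  return {new_scale, new_zero_point};
}
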
      if (static_cast<int64_t>(scales.size()) != factor_values.getNumElements())
@@ -315,8 +315,8 @@ TypeAttr GetQuantizedTypeAttr(const Builder builder, const Type input_type,
                               const bool legacy_float_scale,
                               const bool use_fake_quant_num_bits) {
   SmallVector<double, 4> min_value, max_value;
-  const auto mins = min.dyn_cast<DenseFPElementsAttr>();
-  const auto maxs = max.dyn_cast<DenseFPElementsAttr>();
+  const auto mins = dyn_cast<DenseFPElementsAttr>(min);
+  const auto maxs = dyn_cast<DenseFPElementsAttr>(max);
   if (mins && maxs) {
     min_value.reserve(mins.getNumElements());
     max_value.reserve(maxs.getNumElements());
@@ -327,8 +327,8 @@ TypeAttr GetQuantizedTypeAttr(const Builder builder, const Type input_type,
       max_value.push_back(FloatAttr::getValueAsDouble(*it));
     }
   } else {
-    const auto fmin = min.dyn_cast<FloatAttr>();
-    const auto fmax = max.dyn_cast<FloatAttr>();
+    const auto fmin = dyn_cast<FloatAttr>(min);
+    const auto fmax = dyn_cast<FloatAttr>(max);
     if (fmin && fmax) {
       min_value.push_back(fmin.getValueAsDouble());
       max_value.push_back(fmax.getValueAsDouble());
@@ -348,14 +348,14 @@ TypeAttr CastQuantizedTypeAttrFromExpressedType(const Builder builder,
                                                 const TypeAttr source,
                                                 const Type target,
                                                 const int axis) {
-  const auto source_type = source.getValue().dyn_cast_or_null<ShapedType>();
+  const auto source_type = dyn_cast_or_null<ShapedType>(source.getValue());
   if (!source_type) return {};
   const auto src_ele_type = source_type.getElementType();
-  auto qtype = src_ele_type.dyn_cast<quant::QuantizedType>();
+  auto qtype = dyn_cast<quant::QuantizedType>(src_ele_type);

   // Reset the quantization dimensions if it is per-axis.
   if (const auto per_axis =
-          qtype.dyn_cast_or_null<quant::UniformQuantizedPerAxisType>()) {
+          dyn_cast_or_null<quant::UniformQuantizedPerAxisType>(qtype)) {
     // For pass-through ops, we don't know which dimension will become the new
     // quantization dimension. It is only safe to reset the per-axis quantized
     // type when the new quantization dimension can be inferred.
@@ -396,7 +396,9 @@ void ExtractMinMaxFromAttr(const DenseFPElementsAttr values, const int dim_size,
     }
   } else {
     int64_t flatten_index = 0;
-    for (auto it = values.begin(); it != values.end(); ++it, ++flatten_index) {
+    auto begin = values.begin();
+    auto end = values.end();
+    for (auto it = begin; it != end; ++it, ++flatten_index) {
       const double ele_value = FloatAttr::getValueAsDouble(*it);
       const int slice_index = flatten_index / slice_size;
       const int channel_index = slice_index % dim_size;
@@ -427,7 +429,7 @@ Type GetUniformQuantizedTypeForWeight(
   SmallVector<double, 4> mins(1, std::numeric_limits<double>::max());
   SmallVector<double, 4> maxs(1, std::numeric_limits<double>::min());
-  const auto fp = attr.dyn_cast<DenseFPElementsAttr>();
+  const auto fp = dyn_cast<DenseFPElementsAttr>(attr);
   if (!fp) return {};

   // Computes the effective min/max values of the attribute values.
@@ -438,7 +440,7 @@ Type GetUniformQuantizedTypeForWeight(
       GetQuantizedType(builder, attr.getType(), mins[0], maxs[0],
                        /*quant_dim=*/-1, num_bits, narrow_range, is_signed,
                        legacy_float_scale, use_fake_quant_num_bits);
-  if (const auto ele_type = type.dyn_cast_or_null<TensorType>())
+  if (const auto ele_type = dyn_cast_or_null<TensorType>(type))
     return ele_type.getElementType();

   return {};
@@ -449,7 +451,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight(
     const unsigned num_bits, const bool is_signed, const bool narrow_range,
     const bool legacy_float_scale, const bool use_fake_quant_num_bits) {
   const Builder builder(attr.getContext());
-  const auto shape = attr.getType().cast<ShapedType>().getShape();
+  const auto shape = cast<ShapedType>(attr.getType()).getShape();
   if (static_cast<int>(shape.size()) <= quant_dim) return {};

   // `symmetric` can only be used when it is `signed` and `narrow_range`.
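
// ExtractMinMaxFromAttr (above) reduces a flattened constant into per-channel
// minima and maxima: with `slice_size` contiguous elements per slice, element
// i belongs to channel (i / slice_size) % dim_size. A small sketch of that
// reduction over a plain buffer, assuming the same layout:
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

void PerChannelMinMax(const std::vector<double>& values, const int dim_size,
                      const int slice_size, std::vector<double>& mins,
                      std::vector<double>& maxs) {
  mins.assign(dim_size, std::numeric_limits<double>::max());
  maxs.assign(dim_size, std::numeric_limits<double>::lowest());
  for (int64_t i = 0; i < static_cast<int64_t>(values.size()); ++i) {
    const int channel = static_cast<int>((i / slice_size) % dim_size);
    mins[channel] = std::min(mins[channel], values[i]);
    maxs[channel] = std::max(maxs[channel], values[i]);
  }
}
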
if (symmetric && (!is_signed || !narrow_range)) return {}; @@ -460,7 +462,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight( std::multiplies()); SmallVector mins(dim_size, std::numeric_limits::max()); SmallVector maxs(dim_size, std::numeric_limits::min()); - const auto fp = attr.dyn_cast(); + const auto fp = dyn_cast(attr); if (!fp) return {}; // Computes the effective min/max values of the attribute values. @@ -469,7 +471,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight( const auto type = GetQuantizedType( builder, attr.getType(), mins, maxs, quant_dim, num_bits, narrow_range, is_signed, legacy_float_scale, use_fake_quant_num_bits); - if (auto ele_type = type.dyn_cast_or_null()) + if (auto ele_type = dyn_cast_or_null(type)) return ele_type.getElementType(); return {}; @@ -495,28 +497,28 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( expressed_type = op_type.getExpressedType(); if (const auto type = - op_type.dyn_cast()) { + dyn_cast(op_type)) { if (axis_size != 1 && axis_size != type.getScales().size()) return {}; if (quant_dim != -1 && quant_dim != type.getQuantizedDimension()) return {}; axis_size = type.getScales().size(); quant_dim = type.getQuantizedDimension(); - } else if (!op_type.isa()) { + } else if (!isa(op_type)) { return {}; } } // The scale from the UniformQuantizedTypes is broadcasted if there are // UniformQuantizedPerAxisTypes. - llvm::SmallVector scales(axis_size, 1.0); + SmallVector scales(axis_size, 1.0); for (const auto op_type : op_types) { if (const auto type = - op_type.dyn_cast()) { + dyn_cast(op_type)) { for (const auto& index_scale : llvm::enumerate(type.getScales())) { scales[index_scale.index()] *= index_scale.value(); } } else if (const auto type = - op_type.dyn_cast()) { + dyn_cast(op_type)) { for (int index = 0; index < axis_size; ++index) { scales[index] *= type.getScale(); } @@ -541,7 +543,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( /*flags=*/true, storage_type, expressed_type, scales[0], /*zeroPoint=*/0, storage_type_min, storage_type_max); } else { - llvm::SmallVector zero_points(axis_size, 0); + SmallVector zero_points(axis_size, 0); // If the bias is a 1-D tensor, set the `quantizedDimension` to 0. // If the bias rank is larger than 1 because it was already broadcasted // to match the output shape, use the last index. 
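
// GetUniformQuantizedTypeForBias (above) combines the quantization parameters
// of the non-bias operands by multiplying their scales element-wise, with a
// per-tensor (scalar) scale broadcast across the per-axis ones. A sketch of
// that accumulation, where each operand contributes either one scale or
// `axis_size` scales; names are illustrative:
#include <vector>

std::vector<double> AccumulateBiasScales(
    const std::vector<std::vector<double>>& operand_scales,
    const int axis_size) {
  std::vector<double> scales(axis_size, 1.0);
  for (const std::vector<double>& op_scales : operand_scales) {
    for (int i = 0; i < axis_size; ++i) {
      // A single-element vector models a per-tensor scale being broadcast.
      scales[i] *= op_scales.size() == 1 ? op_scales[0] : op_scales[i];
    }
  }
  return scales;
}
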
@@ -555,30 +557,28 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( ElementsAttr QuantizeLegacy(const Attribute real_value, const Type tensor_type) { - if (!real_value.isa() || + if (!isa(real_value) || !quant::QuantizedType::getQuantizedElementType(tensor_type)) { return {}; } - const auto real_values_attr = real_value.cast(); + const auto real_values_attr = cast(real_value); auto q_type = quant::QuantizedType::getQuantizedElementType(tensor_type); std::vector real_values; - llvm::SmallVector quantized_attr; + SmallVector quantized_attr; real_values.reserve(real_values_attr.getNumElements()); quantized_attr.reserve(real_values_attr.getNumElements()); std::transform(real_values_attr.begin(), real_values_attr.end(), std::back_inserter(real_values), [&](APFloat value) -> float { return value.convertToFloat(); }); - const ShapedType new_dense_type = - q_type.castExpressedToStorageType(real_values_attr.getType()) - .dyn_cast_or_null(); - const int width = - q_type.getStorageType().dyn_cast().getWidth(); + const ShapedType new_dense_type = dyn_cast_or_null( + q_type.castExpressedToStorageType(real_values_attr.getType())); + const int width = dyn_cast(q_type.getStorageType()).getWidth(); if (width == 8 && q_type.getStorageTypeMax() == 127 && q_type.getStorageTypeMin() == -127) { std::vector quantized_values(real_values_attr.getNumElements()); - if (auto uniform_type = q_type.dyn_cast()) { + if (auto uniform_type = dyn_cast(q_type)) { float min, max, scale; tflite::tensor_utils::SymmetricQuantizeFloats( real_values.data(), real_values.size(), quantized_values.data(), &min, @@ -588,7 +588,7 @@ ElementsAttr QuantizeLegacy(const Attribute real_value, return Quantize(real_value, tensor_type); } } else if (auto uniform_type = - q_type.dyn_cast()) { + dyn_cast(q_type)) { std::vector scales_inv; std::vector dimension; dimension.insert(dimension.end(), new_dense_type.getShape().begin(), @@ -617,7 +617,7 @@ ElementsAttr QuantizeLegacy(const Attribute real_value, // not correctly quantized by legacy quantizer so call the new Quantize. 
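
// The width == 8 branch above delegates to
// tflite::tensor_utils::SymmetricQuantizeFloats, i.e. a symmetric mapping
// onto [-127, 127] with an implicit zero point of 0. A minimal sketch of such
// a quantizer (not the TFLite kernel itself):
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<int8_t> SymmetricQuantize(const std::vector<float>& values,
                                      float& scale) {
  float abs_max = 0.0f;
  for (const float v : values) abs_max = std::max(abs_max, std::abs(v));
  scale = abs_max / 127.0f;  // Zero point is implicitly 0.
  std::vector<int8_t> quantized;
  quantized.reserve(values.size());
  for (const float v : values) {
    const int q = scale == 0.0f ? 0 : static_cast<int>(std::round(v / scale));
    quantized.push_back(static_cast<int8_t>(std::clamp(q, -127, 127)));
  }
  return quantized;
}
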
return Quantize(real_value, tensor_type); } else if (width == 16) { - if (const auto uniform_type = q_type.dyn_cast()) { + if (const auto uniform_type = dyn_cast(q_type)) { const auto quantized_values = tflite::optimize::utils::SymmetricQuantizeFloatsToInt16( real_values.data(), real_values.size(), uniform_type.getScale()); @@ -630,10 +630,10 @@ ElementsAttr QuantizeLegacy(const Attribute real_value, } } else if (width == 32) { std::vector scales; - if (const auto uniform_type = q_type.dyn_cast()) { + if (const auto uniform_type = dyn_cast(q_type)) { scales.push_back(uniform_type.getScale()); } else if (const auto uniform_type = - q_type.dyn_cast()) { + dyn_cast(q_type)) { scales.insert(scales.end(), uniform_type.getScales().begin(), uniform_type.getScales().end()); } else { @@ -656,8 +656,8 @@ ElementsAttr Quantize(const Attribute real_value, const Type tensor_type) { if (const auto q_type = quant::QuantizedType::getQuantizedElementType(tensor_type)) { Type converted_type; - return quantfork::quantizeAttr(real_value, q_type, converted_type) - .dyn_cast_or_null(); + return dyn_cast_or_null( + quantfork::quantizeAttr(real_value, q_type, converted_type)); } return {}; } @@ -678,10 +678,9 @@ quant::QuantizedType DownCastScale(QuantizedType type, if (!type) return type; SmallVector scales(mins.size()); SmallVector zero_points(mins.size()); - if (auto q_type = type.dyn_cast()) { + if (auto q_type = dyn_cast(type)) { zero_points.push_back(q_type.getZeroPoint()); - } else if (auto q_type = - type.dyn_cast()) { + } else if (auto q_type = dyn_cast(type)) { zero_points = {q_type.getZeroPoints().begin(), q_type.getZeroPoints().end()}; } @@ -701,13 +700,12 @@ quant::QuantizedType DownCastScale(QuantizedType type, } } } - if (auto q_type = type.dyn_cast()) { + if (auto q_type = dyn_cast(type)) { return UniformQuantizedType::get(q_type.getFlags(), q_type.getStorageType(), q_type.getExpressedType(), scales[0], zero_points[0], q_type.getStorageTypeMin(), q_type.getStorageTypeMax()); - } else if (auto q_type = - type.dyn_cast()) { + } else if (auto q_type = dyn_cast(type)) { return quant::UniformQuantizedPerAxisType::get( q_type.getFlags(), q_type.getStorageType(), q_type.getExpressedType(), scales, zero_points, q_type.getQuantizedDimension(), @@ -722,8 +720,8 @@ quant::QuantizedType DownCastScale(QuantizedType type, static bool PreferResultScale(Operation* op) { int float_operands = 0; for (auto operand : op->getOperands()) { - if (auto operand_type = operand.getType().dyn_cast()) { - if (operand_type.getElementType().isa()) { + if (auto operand_type = dyn_cast(operand.getType())) { + if (isa(operand_type.getElementType())) { if (++float_operands > 1) return true; } } @@ -733,22 +731,22 @@ static bool PreferResultScale(Operation* op) { std::unique_ptr GetDefaultQuantScaleSpec(Operation* op) { auto spec = std::make_unique(); - if (llvm::isa(op)) { + if (isa(op)) { spec->has_same_scale_requirement = true; spec->required_same_scale_func = [op](const bool sign, const int bit_width) { - return llvm::cast(op) + return cast(op) .RequiredSameOperandsAndResultsScale(sign, bit_width); }; spec->required_same_quantized_axes_func = [op]() { - return llvm::cast(op).RequiredSameQuantizedAxes(); + return cast(op).RequiredSameQuantizedAxes(); }; } - if (llvm::isa(op)) { + if (isa(op)) { spec->has_fixed_output_range = true; spec->fixed_output_range_func = [op](bool sign, int bit_width) { - return llvm::cast(op).GetFixedOutputRange( - sign, bit_width); + return cast(op).GetFixedOutputRange(sign, + bit_width); }; } return 
spec; @@ -760,21 +758,21 @@ static bool IsStatsRedundant( Operation* op, const OpQuantSpecGetter op_quant_spec_getter, const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { // If it has FixedOutputRangeInterface, no need to manually create spec. - return llvm::isa(op) || + return isa(op) || op_quant_scale_spec_getter(op)->has_fixed_output_range; } static bool IsSameScaleOp( Operation* op, const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { // If it has SameScalesOpInterface, no need to manually create spec. - return llvm::dyn_cast(op) || + return dyn_cast(op) || op_quant_scale_spec_getter(op)->has_same_scale_requirement; } bool RemoveRedundantStatsOps( - mlir::func::FuncOp func, const OpQuantSpecGetter op_quant_spec_getter, + func::FuncOp func, const OpQuantSpecGetter op_quant_spec_getter, const OpQuantScaleSpecGetter op_quant_scale_spec_getter) { - llvm::SmallVector all_stats_ops; + SmallVector all_stats_ops; llvm::DenseSet redundant_stats_ops; // Step 0: remove the quantfork::StatisticsOp which are used by the @@ -782,8 +780,7 @@ bool RemoveRedundantStatsOps( // ops. func.walk([&](quantfork::QuantizeCastOp q) { auto input_op = q.getArg().getDefiningOp(); - if (auto stats = - llvm::dyn_cast_or_null(input_op)) { + if (auto stats = dyn_cast_or_null(input_op)) { q.setOperand(stats.getArg()); if (stats.use_empty()) stats.erase(); } @@ -820,8 +817,8 @@ bool RemoveRedundantStatsOps( if (!res.hasOneUse()) { continue; } - if (auto next_stats = llvm::dyn_cast( - *res.getUsers().begin())) { + if (auto next_stats = + dyn_cast(*res.getUsers().begin())) { // quantization parameters can be propagated to next_stats redundant_stats_ops.insert(next_stats); // add next_stats to the work list so propagation can continue. @@ -848,7 +845,7 @@ bool RemoveRedundantStatsOps( continue; } for (Value input : def->getOperands()) { - if (auto next_stats = llvm::dyn_cast_or_null( + if (auto next_stats = dyn_cast_or_null( input.getDefiningOp())) { redundant_stats_ops.insert(next_stats); all_stats_ops.push_back(next_stats); @@ -859,8 +856,8 @@ bool RemoveRedundantStatsOps( // Step3: Remove all the redundant stats ops for (Operation* it : redundant_stats_ops) { - if (!llvm::isa(it)) return true; - auto stats_op = llvm::cast(it); + if (!isa(it)) return true; + auto stats_op = cast(it); stats_op.getResult().replaceAllUsesWith(stats_op.getArg()); stats_op.erase(); } @@ -870,9 +867,9 @@ bool RemoveRedundantStatsOps( } LogicalResult VerifySameScales(Operation* op) { - auto same_scale_op = llvm::cast(op); + auto same_scale_op = cast(op); - llvm::SmallVector collected_quant_params; + SmallVector collected_quant_params; for (Value input : op->getOperands()) { QuantizedType quant_params = QuantizedType::getQuantizedElementType(input.getType()); @@ -901,9 +898,9 @@ LogicalResult VerifySameScales(Operation* op) { // method. 
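
// RemoveRedundantStatsOps (above) is a classic worklist traversal: stats ops
// are popped from a vector, marked in a set, and their producers and
// consumers push further candidates. A generic sketch of the shape of that
// algorithm, independent of MLIR:
#include <unordered_set>
#include <vector>

template <typename Node, typename ExpandFn>
std::unordered_set<Node> CollectReachable(std::vector<Node> worklist,
                                          ExpandFn expand) {
  std::unordered_set<Node> visited;
  while (!worklist.empty()) {
    const Node node = worklist.back();
    worklist.pop_back();
    if (!visited.insert(node).second) continue;  // Already processed.
    for (const Node& next : expand(node)) worklist.push_back(next);
  }
  return visited;
}
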
if (!same_scale_op.RequiredSameQuantizedAxes()) { const auto expected_per_axis_qtype = - expected_params.dyn_cast(); + dyn_cast(expected_params); const auto compared_per_axis_qtype = - compared_params.dyn_cast(); + dyn_cast(compared_params); if (expected_per_axis_qtype && compared_per_axis_qtype && llvm::equal(expected_per_axis_qtype.getScales(), compared_per_axis_qtype.getScales()) && @@ -945,8 +942,8 @@ quant::UniformQuantizedType GetFixedOutputRange( const bool is_signed, const int bit_width, const Type tensor_type, const double scale, int64_t zero_point, int64_t storage_min, int64_t storage_max) { - const auto result_type = tensor_type.cast(); - if (!result_type.getElementType().isa()) return {}; + const auto result_type = cast(tensor_type); + if (!isa(result_type.getElementType())) return {}; Builder builder(result_type.getContext()); // Only support 8-bits and 16-bits @@ -988,17 +985,17 @@ Type ConvertSignedQuantizedToUnsigned(const Type signed_tensor_type, const auto flags = !quant::QuantizationFlags::Signed; QType new_qtype; - if (auto uqtype = qtype.dyn_cast()) { + if (auto uqtype = dyn_cast(qtype)) { new_qtype = quant::UniformQuantizedType::getChecked( loc, flags, qtype.getStorageType(), qtype.getExpressedType(), uqtype.getScale(), uqtype.getZeroPoint() - offset, uqtype.getStorageTypeMin() - offset, uqtype.getStorageTypeMax() - offset); } else if (auto aqtype = - qtype.dyn_cast()) { + dyn_cast(qtype)) { const auto zero_points = aqtype.getZeroPoints(); - llvm::SmallVector new_zero_points(zero_points.begin(), - zero_points.end()); + SmallVector new_zero_points(zero_points.begin(), + zero_points.end()); for (int i = 0; i < new_zero_points.size(); ++i) { new_zero_points[i] -= offset; } diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h index 453dc419371932..3f9f56d45fbaa7 100644 --- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h +++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h @@ -493,6 +493,7 @@ class QuantizationPattern : public RewritePattern { continue; } + bool is_operand_or_result_modified = false; // Collect all the quantized inputs and "clone" the matched op by these // inputs. SmallVector inputs; @@ -517,6 +518,7 @@ class QuantizationPattern : public RewritePattern { // Dynamic range quantization is applied by having QuantizeOp as an // input. Only int8 weight is supported for now. inputs.push_back(dq_op.getOperand()); + is_operand_or_result_modified = true; } else { // Otherwise, it's the case where the operand is activations or the // quantizing_op is non-supported/weight-only. @@ -525,6 +527,7 @@ class QuantizationPattern : public RewritePattern { } else { if (auto dq_op = dyn_cast_or_null(operand.getDefiningOp())) { + is_operand_or_result_modified = true; inputs.push_back(dq_op.getOperand()); } else if (!ele_type.isF32()) { // If the operand is an integer tensor, then it doesn't require the @@ -561,6 +564,7 @@ class QuantizationPattern : public RewritePattern { outputs_replaced.insert( {user.getResult(), enumerated_result.index()}); output_types.push_back(user.getType()); + is_operand_or_result_modified = true; } else if (!result_ele_type.isF32()) { // If the result is an integer tensor, then it doesn't require the // D op in the pattern. 
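
// ConvertSignedQuantizedToUnsigned (above) keeps the represented real values
// identical while flipping the storage sign: every storage-domain quantity is
// shifted by -offset, where offset is assumed to be the signed storage
// minimum (e.g. -128 for 8 bits), so zp_unsigned = zp_signed + 2^(bits-1). A
// sketch under that assumption, with illustrative names:
#include <cstdint>

struct QuantParams {
  double scale;
  int64_t zero_point;
  int64_t storage_min;
  int64_t storage_max;
};

QuantParams SignedToUnsigned(const QuantParams& signed_params,
                             const int num_bits) {
  const int64_t offset = -(int64_t{1} << (num_bits - 1));  // e.g. -128
  // Subtracting the negative offset shifts [-128, 127] to [0, 255] while the
  // scale, and hence every represented real value, is unchanged.
  return {signed_params.scale, signed_params.zero_point - offset,
          signed_params.storage_min - offset,
          signed_params.storage_max - offset};
}
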
@@ -576,6 +580,13 @@ class QuantizationPattern : public RewritePattern {
         }
       }

+      // For float16 quantization, skip replacing the op when none of its
+      // operands or results was modified. See b/335025403.
+      if (inference_type == tensorflow::DT_HALF &&
+          !is_operand_or_result_modified) {
+        return failure();
+      }
+
       rewriter.setInsertionPointAfter(quantizing_op);
       OperationState new_state(quantizing_op->getLoc(),
                                quantizing_op->getName().getStringRef(), inputs,
diff --git a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.cc b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.cc
index a64ba201250727..7f66d76798acfa 100644
--- a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.cc
+++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.cc
@@ -94,12 +94,12 @@ bool IsStorageTypeI32(const QuantizedType quantized_type) {

 bool IsExpressedTypeF32(const QuantizedType quantized_type) {
   const Type expressed_type = quantized_type.getExpressedType();
-  return expressed_type.isa<Float32Type>();
+  return mlir::isa<Float32Type>(expressed_type);
 }

 bool IsI8F32UniformQuantizedType(const Type type) {
   const UniformQuantizedType quantized_type =
-      type.dyn_cast_or_null<UniformQuantizedType>();
+      mlir::dyn_cast_or_null<UniformQuantizedType>(type);
   if (!quantized_type) {
     LLVM_DEBUG(llvm::dbgs() << "Expected a uniform quantized type. Got: "
                             << type << ".\n");
@@ -123,7 +123,7 @@ bool IsI8F32UniformQuantizedType(const Type type) {

 bool IsI8F32UniformQuantizedPerAxisType(const Type type) {
   const UniformQuantizedPerAxisType quantized_per_axis_type =
-      type.dyn_cast_or_null<UniformQuantizedPerAxisType>();
+      mlir::dyn_cast_or_null<UniformQuantizedPerAxisType>(type);
   if (!quantized_per_axis_type) {
     LLVM_DEBUG(llvm::dbgs() << "Expected a uniform quantized type. Got: "
                             << type << ".\n");
@@ -147,7 +147,7 @@ bool IsI8F32UniformQuantizedPerAxisType(const Type type) {

 bool IsI32F32UniformQuantizedType(const Type type) {
   const UniformQuantizedType quantized_type =
-      type.dyn_cast_or_null<UniformQuantizedType>();
+      mlir::dyn_cast_or_null<UniformQuantizedType>(type);
   if (!quantized_type) {
     LLVM_DEBUG(llvm::dbgs() << "Expected a uniform quantized type. Got: "
                             << type << ".\n");
@@ -171,7 +171,7 @@ bool IsI32F32UniformQuantizedType(const Type type) {

 bool IsI32F32UniformQuantizedPerAxisType(const Type type) {
   const UniformQuantizedPerAxisType quantized_per_axis_type =
-      type.dyn_cast_or_null<UniformQuantizedPerAxisType>();
+      mlir::dyn_cast_or_null<UniformQuantizedPerAxisType>(type);
   if (!quantized_per_axis_type) {
     LLVM_DEBUG(llvm::dbgs() << "Expected a uniform quantized type. Got: "
                             << type << ".\n");
@@ -208,11 +208,11 @@ bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type) {
 }

 bool IsQuantizedTensorType(Type type) {
-  if (!type.isa<TensorType>()) {
+  if (!mlir::isa<TensorType>(type)) {
     return false;
   }
-  Type element_type = type.cast<TensorType>().getElementType();
-  return element_type.isa<QuantizedType>();
+  Type element_type = mlir::cast<TensorType>(type).getElementType();
+  return mlir::isa<QuantizedType>(element_type);
 }

 bool IsOpFullyQuantized(Operation* op) {
diff --git a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h
index ab850c878ff0dd..e30db98a9616de 100644
--- a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h
+++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h
@@ -82,7 +82,7 @@ bool IsExpressedTypeF32(QuantizedType quantized_type);

 // Given a value, extract the `ElementType`.
 // `value` should be a non-null `TensorType`.
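
// The is_operand_or_result_modified flag added to QuantizationPattern above
// guards the float16 path: if no operand came from a dequantize op and no
// result feeds a quantize op, the rewrite would only clone the op without
// changing anything, so the pattern bails out. The guard reduces to logic
// like this sketch (illustrative names, not the pattern's real interface):
bool ShouldRewriteForFloat16(const bool is_float16_inference,
                             const bool operand_or_result_modified) {
  // Only rewrite when the rewrite would actually change something.
  return !is_float16_inference || operand_or_result_modified;
}
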
inline Type GetElementType(const Value value) { - return value.getType().cast().getElementType(); + return mlir::cast(value.getType()).getElementType(); } // Returns true iff `type` is a uniform quantized type whose storage type is diff --git a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc index e9443a667fcef3..d4055b1732b1d8 100644 --- a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc @@ -348,7 +348,8 @@ TEST_F(IsI8F32UniformQuantizedTypeTest, UniformQuantizedTypeSucceeds) { /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, /*zeroPoint=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); - EXPECT_THAT(qi8_type.dyn_cast_or_null(), NotNull()); + EXPECT_THAT(mlir::dyn_cast_or_null(qi8_type), + NotNull()); } TEST_F(IsI8F32UniformQuantizedTypeTest, StorageTypeI8Succeeds) { @@ -398,8 +399,9 @@ TEST_F(IsI8F32UniformQuantizedTypeTest, UniformQuantizedPerAxisTypeSucceeds) { /*scales=*/{1.0}, /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); - EXPECT_THAT(qi8_per_axis_type.dyn_cast_or_null(), - NotNull()); + EXPECT_THAT( + mlir::dyn_cast_or_null(qi8_per_axis_type), + NotNull()); } TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, StorageTypeI8Succeeds) { @@ -452,7 +454,8 @@ TEST_F(IsI32F32UniformQuantizedTypeTest, UniformQuantizedTypeSucceeds) { /*zeroPoint=*/0, /*storageTypeMin=*/-2147483647, /*storageTypeMax=*/2147483646); EXPECT_TRUE(IsI32F32UniformQuantizedType(qi32_type)); - EXPECT_THAT(qi32_type.dyn_cast_or_null(), NotNull()); + EXPECT_THAT(mlir::dyn_cast_or_null(qi32_type), + NotNull()); } TEST_F(IsI32F32UniformQuantizedTypeTest, StorageTypeI32Succeeds) { @@ -509,7 +512,7 @@ TEST_F(IsI32F32UniformQuantizedPerAxisTypeTest, /*storageTypeMax=*/127); EXPECT_FALSE(IsI32F32UniformQuantizedPerAxisType(qi8_type)); EXPECT_FALSE(IsStorageTypeI32(qi8_type)); - EXPECT_THAT(qi8_type.dyn_cast_or_null(), + EXPECT_THAT(mlir::dyn_cast_or_null(qi8_type), IsNull()); } @@ -523,7 +526,7 @@ TEST_F(IsI32F32UniformQuantizedTypeTest, UniformQuantizedPerAxisTypeSucceeds) { /*storageTypeMin=*/-2147483647, /*storageTypeMax=*/2147483646); EXPECT_THAT( - qi32_per_axis_type.dyn_cast_or_null(), + mlir::dyn_cast_or_null(qi32_per_axis_type), NotNull()); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index 3da423119752cb..db53084419f1f9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -51,6 +51,7 @@ cc_library( "passes/convert_xla_call_module_op_to_bfloat16.cc", "passes/defer_activation_transpose.cc", "passes/fold_constant_transpose.cc", + "passes/insert_calibration_statistics_saver.cc", "passes/insert_weight_param.cc", "passes/lift_quantizable_spots_as_functions.cc", "passes/lift_quantizable_spots_as_functions_fusion.inc", @@ -99,6 +100,7 @@ cc_library( "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:permutation", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:report", + "//tensorflow/compiler/mlir/quantization/stablehlo/instrumentations:save_report", "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", 
"//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", @@ -138,6 +140,7 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", + "@local_tsl//tsl/platform:path", "@local_tsl//tsl/platform:protobuf", "@local_tsl//tsl/platform:regexp", "@local_tsl//tsl/platform:str_util", @@ -572,6 +575,7 @@ tf_cc_test( deps = [ ":math_utils", "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD index 5ae92d648bf5c9..f175dfdc9ea1a2 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD @@ -40,6 +40,8 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings:string_view", ], ) @@ -62,7 +64,9 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:statusor", ], ) @@ -75,6 +79,7 @@ tf_cc_test( "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", "@com_google_googletest//:gtest_main", "@local_tsl//tsl/platform:env", "@local_tsl//tsl/platform:status", @@ -110,10 +115,7 @@ cc_library( hdrs = ["debugger.h"], compatible_with = get_compatible_with_portable(), deps = [ - ":graph_def", - "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", - "//tensorflow/core:protos_all_cc", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", ], @@ -321,9 +323,12 @@ cc_library( hdrs = ["report.h"], compatible_with = get_compatible_with_portable(), deps = [ + ":io", "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", @@ -337,13 +342,18 @@ tf_cc_test( name = "report_test", srcs = ["report_test.cc"], deps = [ + ":io", ":report", "//tensorflow/compiler/mlir/quantization/common:test_base", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:string_view", "@com_google_googletest//:gtest_main", "@llvm-project//mlir:IR", "@local_tsl//tsl/platform:protobuf", + "@local_tsl//tsl/platform:status_matchers", ], ) @@ -365,8 +375,10 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ ":component", + ":config", ":pass_pipeline", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/instrumentations:save_report", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", 
"//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "@com_google_absl//absl/base:nullability", @@ -436,12 +448,14 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ ":component", + ":config", ":context", ":pass_pipeline", ":saved_model_export", ":saved_model_import", ":types", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/instrumentations:save_report", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD index 3fbd4ed586e45f..9926546f8c47a8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD @@ -25,15 +25,22 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io", "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:min_max_value", "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", - "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:path", + "@local_tsl//tsl/platform:statusor", ], ) @@ -45,6 +52,7 @@ cc_library( deps = [ ":representative_dataset", ":statistics", + "//tensorflow/compiler/mlir/quantization/stablehlo:passes", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:component", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:debugger", @@ -53,8 +61,11 @@ cc_library( "//tensorflow/compiler/mlir/quantization/stablehlo/cc:types", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:nullability", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/log", @@ -67,6 +78,7 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:statusor", ], ) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h 
b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h index ffad37d15d243c..9e1950afa76dba 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h @@ -23,10 +23,6 @@ limitations under the License. namespace stablehlo::quantization { -// TODO: b/321158562 - Make the number of bins configurable. -// Default number of histogram bins for each batch sample. -constexpr int32_t kDefaultNumOfBins = 1 << 9; - // Calculates the bin width from the range and expected number of bins. The // bin width is formalized to the form of 2^n. As a consequence, the actual // number of bins might be smaller than the given `num_bins`. @@ -70,8 +66,10 @@ inline bool IsHistogramCalibration( } // Gets the number of bins for the given calibration method. -inline int32_t GetNumBins(const CalibrationOptions::CalibrationMethod method) { - return IsHistogramCalibration(method) ? kDefaultNumOfBins : 0; +inline int32_t GetNumBins(const CalibrationOptions& calib_opts) { + return IsHistogramCalibration(calib_opts.calibration_method()) + ? calib_opts.calibration_parameters().num_bins() + : 0; } } // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc index ce626145318b9f..52db906e512391 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/algorithm/container.h" #include "absl/base/nullability.h" #include "absl/container/flat_hash_map.h" #include "absl/log/die_if_null.h" @@ -40,26 +41,67 @@ limitations under the License. 
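
// calibration_parameters.h (above) notes that the histogram bin width is
// formalized to the form 2^n, so the actual bin count can be smaller than the
// requested num_bins. A hedged sketch of that calculation, assuming the
// conventional rounding-up to a power of two (the real CalculateBinWidth is
// not shown in this hunk) and a strictly positive range:
#include <cmath>
#include <cstdint>

float CalculateBinWidthSketch(const float min_value, const float max_value,
                              const int32_t num_bins) {
  const float raw_bin_width = (max_value - min_value) / num_bins;
  // Rounding the width up to the nearest power of two widens each bin, which
  // means fewer effective bins than requested.
  return std::pow(2.0f, std::ceil(std::log2(raw_bin_width)));
}
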
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" namespace mlir::quant::stablehlo { +namespace { using ::stablehlo::quantization::AddCalibrationStatistics; using ::stablehlo::quantization::CreateRepresentativeDatasetFileMap; using ::stablehlo::quantization::DisableDebugging; +using ::stablehlo::quantization::IsCalibrationRequired; using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::ReadStatistics; using ::stablehlo::quantization::RepresentativeDatasetConfig; using ::stablehlo::quantization::io::CreateTmpDir; using ::stablehlo::quantization::io::GetLocalTmpFileName; +using ::stablehlo::quantization::io::ListDirectory; using ::tensorflow::AssetFileDef; using ::tensorflow::SignatureDef; +using ::tensorflow::calibrator::CalibrationStatistics; using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::PyFunctionLibrary; +using ::tensorflow::quantization::RunPasses; +using CalibrationStatisticsFlatMap = + absl::flat_hash_map; + +} // namespace + +absl::Status RunCalibrationPasses( + mlir::ModuleOp module_op, MLIRContext& ctx, + absl::string_view calibration_data_dir, + const bool force_regenerate_calibration_data) { + // Disable DumpTensor ops when running calibration. + DisableDebugging(module_op); + + std::vector skipping_aggregator_ops; + if (!force_regenerate_calibration_data) { + TF_ASSIGN_OR_RETURN(const CalibrationStatisticsFlatMap statistics_map, + ReadStatistics(calibration_data_dir)); + absl::c_for_each(statistics_map, [&](const auto& iter) { + return skipping_aggregator_ops.push_back(iter.first); + }); + } + + return RunPasses( + /*name=*/ + CalibrationComponent::kName, + /*add_passes_func=*/ + [calibration_data_dir, &skipping_aggregator_ops](PassManager& pm) { + pm.addPass(CreateInsertCalibrationStatisticsSaverPass( + calibration_data_dir, skipping_aggregator_ops)); + }, + ctx, module_op); +} CalibrationComponent::CalibrationComponent( absl::Nonnull ctx, @@ -77,16 +119,23 @@ CalibrationComponent::CalibrationComponent( signature_def_map_(std::move(signature_def_map)), signature_keys_(std::move(signature_keys)) {} -absl::StatusOr CalibrationComponent::ExportToSavedModel( - ModuleOp module_op, const absl::string_view dst_saved_model_path) { +absl::Status CalibrationComponent::ExportToSavedModel( + ModuleOp module_op, absl::string_view calibration_data_dir, + const bool force_regenerate_calibration_data, + const absl::string_view dst_saved_model_path) { TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName()); // Clone ModuleOp and function aliases so changes in this pipeline won't // be reflected in the original values. 
mlir::OwningOpRef cloned_module_ref(module_op.clone()); - // Disable DumpTensor ops when running calibration. - DisableDebugging(*cloned_module_ref); + TF_RETURN_IF_ERROR(RunCalibrationPasses(*cloned_module_ref, *ctx_, + calibration_data_dir, + force_regenerate_calibration_data)); + + const bool is_calibration_required = + IsCalibrationRequired(*cloned_module_ref); + if (!is_calibration_required) return absl::OkStatus(); // `duplicate_shape_determining_constants = false` because the // resulting graph of this step is not expected to be loaded on TPU. @@ -107,42 +156,52 @@ absl::StatusOr CalibrationComponent::ExportToSavedModel( src_saved_model_path_, tags_, signature_def_map_); - return exported_model; + return absl::OkStatus(); } absl::StatusOr CalibrationComponent::Run( ModuleOp module_op, const QuantizationConfig& config) { - // Exports the pre-calibrated model to SavedModel. - TF_ASSIGN_OR_RETURN(const std::string precalibrated_saved_model_dir, + // Export the calibration model to SavedModel. + TF_ASSIGN_OR_RETURN(const std::string calibration_saved_model_dir, CreateTmpDir()); - TF_ASSIGN_OR_RETURN( - ExportedModel exported_model, - ExportToSavedModel(module_op, precalibrated_saved_model_dir)); - - // Translates `RepresentativeDatasetConfig`s to signature key -> - // `RepresentativeDatasetFile` mapping. - const auto dataset_configs = - config.calibration_options().representative_datasets(); - const std::vector dataset_config_vector( - dataset_configs.begin(), dataset_configs.end()); - TF_ASSIGN_OR_RETURN( - const auto representative_dataset_file_map, - CreateRepresentativeDatasetFileMap(dataset_config_vector)); - - // Runs calibration on the exported model. The statistics will be stored in a - // separate singleton object `CalibratorSingleton` and are directly added to - // `exported_model` without re-importing it. - if (py_function_lib_->RunCalibration( - precalibrated_saved_model_dir, signature_keys_, tags_, - /*force_graph_mode_calibration=*/true, - representative_dataset_file_map) == std::nullopt) { - return absl::InternalError( - "CalibrationComponent error: Failed to run calibration."); + std::string calibration_data_dir = + config.calibration_options().calibration_data_dir(); + if (calibration_data_dir.empty()) { + TF_ASSIGN_OR_RETURN(calibration_data_dir, CreateTmpDir()); + } + + TF_RETURN_IF_ERROR(ExportToSavedModel( + module_op, calibration_data_dir, + config.calibration_options().force_regenerate_calibration_data(), + calibration_saved_model_dir)); + + TF_ASSIGN_OR_RETURN(std::vector calibration_saved_model_files, + ListDirectory(calibration_saved_model_dir)); + if (!calibration_saved_model_files.empty()) { + // Translate `RepresentativeDatasetConfig`s to signature key -> + // `RepresentativeDatasetFile` mapping. + const auto dataset_configs = + config.calibration_options().representative_datasets(); + const std::vector dataset_config_vector( + dataset_configs.begin(), dataset_configs.end()); + TF_ASSIGN_OR_RETURN( + const auto representative_dataset_file_map, + CreateRepresentativeDatasetFileMap(dataset_config_vector)); + + // Run calibration on the exported model. 
+ if (py_function_lib_->RunCalibration( + calibration_saved_model_dir, signature_keys_, tags_, + /*force_graph_mode_calibration=*/true, + representative_dataset_file_map) == std::nullopt) { + return absl::InternalError( + "CalibrationComponent error: Failed to run calibration."); + } } if (absl::Status status = AddCalibrationStatistics( - module_op, config.calibration_options(), *py_function_lib_); + module_op, calibration_data_dir, config.calibration_options(), + *py_function_lib_); !status.ok()) { LOG(WARNING) << "Some CustomAggregator ops do not have min or max " "values. Parts of the graph are not quantized. " diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h index cb137031948a3a..03d2dd933732d4 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h @@ -21,6 +21,7 @@ limitations under the License. #include "absl/base/nullability.h" #include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/string_view.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -76,9 +77,11 @@ class CalibrationComponent : public Component { // Exports `module_op` to SavedModel at `dst_saved_model_path`. This is used // to export the pre-calibrated `module_op` to SavedModel so that the // calibration process can use it to load and run the graph with the - // representative dataset. - absl::StatusOr ExportToSavedModel( - ModuleOp module_op, absl::string_view dst_saved_model_path); + // representative dataset. Returns a failure status if the export fails. + absl::Status ExportToSavedModel(ModuleOp module_op, + absl::string_view calibration_data_dir, + bool force_regenerate_calibration_data, + absl::string_view dst_saved_model_path); // Imports the SavedModel at `calibrated_saved_model_path` to `ModuleOp` after // running calibration. @@ -109,6 +112,11 @@ class CalibrationComponent : public Component { const std::vector signature_keys_; }; +// Runs passes to prepare the calibration model. +absl::Status RunCalibrationPasses(mlir::ModuleOp module_op, MLIRContext& ctx, + absl::string_view calibration_data_dir, + bool force_regenerate_calibration_data); + } // namespace mlir::quant::stablehlo #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_COMPONENT_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc index 19a44097458f1a..ea96bd029b079e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc @@ -15,39 +15,69 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" #include +#include +#include +#include "absl/container/flat_hash_map.h" #include "absl/status/status.h" +#include "absl/status/statusor.h" #include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/min_max_value.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tsl/platform/path.h" +#include "tsl/platform/statusor.h" namespace stablehlo::quantization { namespace { using ::stablehlo::quantization::CalibrationOptions; using ::tensorflow::calibrator::CalibrationStatistics; -using ::tensorflow::calibrator::CalibratorSingleton; +using ::tensorflow::calibrator::CalibrationStatisticsMap; using ::tensorflow::quantization::PyFunctionLibrary; +using CalibrationStatisticsFlatMap = + absl::flat_hash_map; } // namespace +// Reads the calibration statistics from the given directory. +absl::StatusOr ReadStatistics( + absl::string_view calibration_data_dir) { + TF_ASSIGN_OR_RETURN(std::vector statistics_files, + io::ListDirectory(calibration_data_dir)); + + CalibrationStatisticsFlatMap statistics_map; + for (const std::string& statistics_file : statistics_files) { + TF_ASSIGN_OR_RETURN( + const auto single_map, + io::ReadBinaryProto( + tsl::io::JoinPath(calibration_data_dir, statistics_file))); + statistics_map.insert(single_map.statistics().begin(), + single_map.statistics().end()); + } + return statistics_map; +} + absl::Status AddCalibrationStatistics( - mlir::ModuleOp module_op, const CalibrationOptions& calibration_options, + mlir::ModuleOp module_op, absl::string_view calibration_data_dir, + const CalibrationOptions& calibration_options, const PyFunctionLibrary& py_function_library) { + TF_ASSIGN_OR_RETURN(const CalibrationStatisticsFlatMap statistics_map, + ReadStatistics(calibration_data_dir)); + absl::Status status = absl::OkStatus(); - module_op.walk([&py_function_library, &calibration_options, - &status](mlir::TF::CustomAggregatorOp aggregator_op) { + module_op.walk([&py_function_library, &calibration_options, &status, + &statistics_map](mlir::TF::CustomAggregatorOp aggregator_op) { mlir::StringRef id = aggregator_op.getId(); - std::optional statistics = - CalibratorSingleton::GetStatistics(id); - if (statistics == std::nullopt) { + auto iter = statistics_map.find(id); + if (iter == statistics_map.end()) { status = absl::InternalError( absl::StrFormat("Calibrated data does not exist. Cannot find " "statistics. 
value for id: %s", @@ -56,10 +86,8 @@ absl::Status AddCalibrationStatistics( } const std::optional min_max_values = - py_function_library.GetCalibrationMinMaxValue(*statistics, + py_function_library.GetCalibrationMinMaxValue(iter->second, calibration_options); - CalibratorSingleton::ClearData(id); - if (min_max_values == std::nullopt) { status = absl::InternalError( "Cannot find min/max values for calibration statistics."); @@ -74,4 +102,14 @@ absl::Status AddCalibrationStatistics( return status; } +bool IsCalibrationRequired(mlir::ModuleOp module_op) { + bool calibration_required = false; + module_op.walk( + [&calibration_required]( + mlir::TF::CalibrationStatisticsSaverOp statistics_saver_op) { + calibration_required = true; + }); + return calibration_required; +} + } // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h index 9b67f22a2dac72..41f78be3578bca 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h @@ -15,22 +15,36 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ +#include + +#include "absl/container/flat_hash_map.h" #include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" namespace stablehlo::quantization { +// Reads the calibration statistics from the given directory. +absl::StatusOr> +ReadStatistics(absl::string_view calibration_data_dir); + // Adds calibrated min / max values to CustomAggregator nodes in `graph_def`. // The min and max values will be added to the "min" and "max" attributes, // respectively. `calibration_options` provides the strategy to retrieve min and // max values. absl::Status AddCalibrationStatistics( - mlir::ModuleOp module_op, + mlir::ModuleOp module_op, absl::string_view calibration_data_dir, const stablehlo::quantization::CalibrationOptions& calibration_options, const tensorflow::quantization::PyFunctionLibrary& py_function_library); +// Checks if the model required calibration. 
+bool IsCalibrationRequired(mlir::ModuleOp module_op); + } // namespace stablehlo::quantization #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc index b3aa1500a0a3c7..1522c68f300cba 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc @@ -29,59 +29,35 @@ void PopulateDefaultCalibrationOptions(QuantizationConfig& quant_config) { quant_config.mutable_calibration_options()->set_calibration_method( CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); } + switch (quant_config.calibration_options().calibration_method()) { - case CalibrationOptions::CALIBRATION_METHOD_MIN_MAX: - break; - case CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX: - break; case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE: - if (quant_config.calibration_options() - .calibration_parameters() - .initial_num_bins() == 0) { - quant_config.mutable_calibration_options() - ->mutable_calibration_parameters() - ->set_initial_num_bins(256); - } - if (quant_config.calibration_options() - .calibration_parameters() - .min_percentile() == 0) { - quant_config.mutable_calibration_options() - ->mutable_calibration_parameters() - ->set_min_percentile(0.001); - } - if (quant_config.calibration_options() - .calibration_parameters() - .max_percentile() == 0) { - quant_config.mutable_calibration_options() - ->mutable_calibration_parameters() - ->set_max_percentile(99.999); - } - break; case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE: - if (quant_config.calibration_options() - .calibration_parameters() - .initial_num_bins() == 0) { - quant_config.mutable_calibration_options() - ->mutable_calibration_parameters() - ->set_initial_num_bins(256); - } - break; case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY: - if (quant_config.calibration_options() - .calibration_parameters() - .initial_num_bins() == 0) { - quant_config.mutable_calibration_options() - ->mutable_calibration_parameters() - ->set_initial_num_bins(256); - } - break; case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC: if (quant_config.calibration_options() .calibration_parameters() - .initial_num_bins() == 0) { + .num_bins() == 0) { quant_config.mutable_calibration_options() ->mutable_calibration_parameters() - ->set_initial_num_bins(256); + ->set_num_bins(512); + } + if (quant_config.calibration_options().calibration_method() == + CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE) { + if (quant_config.calibration_options() + .calibration_parameters() + .min_percentile() == 0) { + quant_config.mutable_calibration_options() + ->mutable_calibration_parameters() + ->set_min_percentile(0.001); + } + if (quant_config.calibration_options() + .calibration_parameters() + .max_percentile() == 0) { + quant_config.mutable_calibration_options() + ->mutable_calibration_parameters() + ->set_max_percentile(99.999); + } } break; default: @@ -109,11 +85,18 @@ QuantizationSpec GetDefaultStaticRangePtqSpec(StaticRangePtqPreset preset) { return spec; } -QuantizationSpec GetDefaultWeightOnlyPtqSpec(WeightOnlyPtqPreset preset) { +QuantizationSpec GetDefaultWeightOnlyPtqSpec() { QuantizationSpec spec{}; spec.mutable_matcher()->mutable_function_name()->set_regex( "^.*(conv|dot_general).*"); - spec.mutable_method()->mutable_weight_only_ptq(); + + WeightOnlyPtq& 
weight_only_ptq_spec = + *spec.mutable_method()->mutable_weight_only_ptq(); + if (auto [iter, inserted] = + weight_only_ptq_spec.mutable_input_quantized_types()->try_emplace(1); + inserted) { + iter->second.mutable_dimension_specs(); + } return spec; } @@ -133,6 +116,9 @@ QuantizationSpec GetDefaultWeightOnlyPtqSpec(WeightOnlyPtqPreset preset) { // } QuantizationSpec GetPtqSpecForConvolution(Method::MethodCase method_case) { QuantizationSpec spec{}; + if (method_case != Method::kStaticRangePtq) { + return spec; + } // Matches all convolution quantizable unit family. spec.mutable_matcher()->mutable_function_name()->set_regex( @@ -147,18 +133,10 @@ QuantizationSpec GetPtqSpecForConvolution(Method::MethodCase method_case) { // The index of weight operands passed to lifted functions for convolution // is 1. - if (method_case == Method::kStaticRangePtq) { - StaticRangePtq& static_range_ptq_spec = - *spec.mutable_method()->mutable_static_range_ptq(); - static_range_ptq_spec.mutable_input_quantized_types()->try_emplace( - 1, std::move(conv_weight_quantized_type)); - } else if (method_case == Method::kWeightOnlyPtq) { - WeightOnlyPtq& weight_only_ptq_spec = - *spec.mutable_method()->mutable_weight_only_ptq(); - weight_only_ptq_spec.mutable_input_quantized_types()->try_emplace( - 1, std::move(conv_weight_quantized_type)); - } - + StaticRangePtq& static_range_ptq_spec = + *spec.mutable_method()->mutable_static_range_ptq(); + static_range_ptq_spec.mutable_input_quantized_types()->try_emplace( + 1, std::move(conv_weight_quantized_type)); return spec; }; @@ -192,15 +170,12 @@ void ExpandStaticRangePtqPreset(const StaticRangePtqPreset& preset, config.mutable_specs()->Swap(&new_specs); } -void ExpandWeightOnlyPtqPreset(const WeightOnlyPtqPreset& preset, - QuantizationConfig& config) { +void ExpandWeightOnlyPtqPreset(QuantizationConfig& config) { // Create a new `QuantizationSpecs` to replace the existing one. The // expansion from `WeightOnlyPtqPreset` gets populated first and then // user-provided explicit `QuantizationSpec`s will be appended. QuantizationSpecs new_specs{}; - *new_specs.add_specs() = - GetDefaultWeightOnlyPtqSpec(/*preset=*/config.weight_only_ptq_preset()); - // TODO: b/307625297 - Add per-channel weight only support. + *new_specs.add_specs() = GetDefaultWeightOnlyPtqSpec(); // Append user-provided specs to override existing specs. const QuantizationSpecs& previous_specs = config.specs(); @@ -222,7 +197,7 @@ QuantizationConfig ExpandPresets(const QuantizationConfig& config) { ExpandStaticRangePtqPreset(config.static_range_ptq_preset(), new_config); break; case QuantizationConfig::kWeightOnlyPtqPreset: - ExpandWeightOnlyPtqPreset(config.weight_only_ptq_preset(), new_config); + ExpandWeightOnlyPtqPreset(new_config); break; default: // Preset has not been specified. The expansion is a no-op. diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h index 19f250bedfe1b8..f668cacd41ba2d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h @@ -15,6 +15,10 @@ limitations under the License. 
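
// With the consolidation in config.cc above, every histogram-based method now
// shares the single num_bins default (512), and only the percentile method
// additionally defaults min/max percentiles. A usage sketch of the resulting
// behavior, assuming the PopulateDefaults declaration from config.h and
// mirroring the updated tests:
#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"

void PopulateDefaultsExample() {
  using ::stablehlo::quantization::CalibrationOptions;
  using ::stablehlo::quantization::PopulateDefaults;
  using ::stablehlo::quantization::QuantizationConfig;

  QuantizationConfig config;
  config.mutable_calibration_options()->set_calibration_method(
      CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE);
  const QuantizationConfig populated = PopulateDefaults(config);
  // populated.calibration_options().calibration_parameters().num_bins()
  // now returns 512 unless the user already set a non-zero value.
}
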
 #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_
 #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_
 
+#include <optional>
+
+#include "absl/base/attributes.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 
 namespace stablehlo::quantization {
@@ -45,6 +49,17 @@ QuantizationConfig ExpandPresets(const QuantizationConfig& config);
 bool HasQuantizationMethod(const QuantizationSpecs& specs,
                            Method::MethodCase method_case);
 
+// Convenience function for converting the optional `report_file_path` field to
+// `std::optional<absl::string_view>`, where `std::nullopt` represents that the
+// field is not explicitly set. The returned value is a reference type
+// (`absl::string_view`) so its lifetime is bound to the input `config`.
+inline std::optional<absl::string_view> GetReportFilePath(
+    const QuantizationConfig& config ABSL_ATTRIBUTE_LIFETIME_BOUND) {
+  return config.has_report_file_path()
+             ? std::make_optional<absl::string_view>(config.report_file_path())
+             : std::nullopt;
+}
+
 }  // namespace stablehlo::quantization
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc
index c46daaf1252f26..4662339f85624f 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc
@@ -24,6 +24,7 @@ namespace {
 using ::testing::Eq;
 using ::testing::SizeIs;
 using ::testing::StrEq;
+using ::testing::Truly;
 
 TEST(PopulateDefaultsTest, PopulateDefaultsForEmptyConfig) {
   QuantizationConfig config{};
@@ -69,18 +70,16 @@ TEST(PopulateDefaultsTest, ExplicitCalibrationOptionsNotOverridden) {
       *config.mutable_calibration_options();
   calibration_options.set_calibration_method(
       CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX);
-  calibration_options.mutable_calibration_parameters()->set_initial_num_bins(
-      512);
+  calibration_options.mutable_calibration_parameters()->set_num_bins(512);
 
   // Test that if the user explicitly provided `calibration_options`, it is not
   // overridden.
   const QuantizationConfig new_config = PopulateDefaults(config);
   EXPECT_THAT(new_config.calibration_options().calibration_method(),
               Eq(CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX));
-  EXPECT_THAT(new_config.calibration_options()
-                  .calibration_parameters()
-                  .initial_num_bins(),
-              Eq(512));
+  EXPECT_THAT(
+      new_config.calibration_options().calibration_parameters().num_bins(),
+      Eq(512));
 }
 
 TEST(PopulateDefaultsTest, DefaultNumbersPopulatedForPartOfCalibrationOptions) {
@@ -89,18 +88,16 @@ TEST(PopulateDefaultsTest, DefaultNumbersPopulatedForPartOfCalibrationOptions) {
       *config.mutable_calibration_options();
   calibration_options.set_calibration_method(
       CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE);
-  calibration_options.mutable_calibration_parameters()->set_initial_num_bins(
-      512);
+  calibration_options.mutable_calibration_parameters()->set_num_bins(512);
 
   // Test that if the user explicitly provided part of the
   // `calibration_options`, it is not overridden, rest of the data are default.
const QuantizationConfig new_config = PopulateDefaults(config); EXPECT_THAT(new_config.calibration_options().calibration_method(), Eq(CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE)); - EXPECT_THAT(new_config.calibration_options() - .calibration_parameters() - .initial_num_bins(), - Eq(512)); + EXPECT_THAT( + new_config.calibration_options().calibration_parameters().num_bins(), + Eq(512)); EXPECT_THAT(new_config.calibration_options() .calibration_parameters() .min_percentile(), @@ -123,10 +120,9 @@ TEST(PopulateDefaultsTest, EXPECT_THAT( new_config.calibration_options().calibration_method(), Eq(CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE)); - EXPECT_THAT(new_config.calibration_options() - .calibration_parameters() - .initial_num_bins(), - Eq(256)); + EXPECT_THAT( + new_config.calibration_options().calibration_parameters().num_bins(), + Eq(512)); EXPECT_THAT(new_config.calibration_options() .calibration_parameters() .min_percentile(), @@ -171,10 +167,12 @@ TEST(ExpandPresetsTest, ExpandStaticRangePtqEnableFullIntquantization) { const StaticRangePtq& srq_spec = conv_spec.method().static_range_ptq(); ASSERT_THAT(srq_spec.input_quantized_types(), SizeIs(1)); ASSERT_TRUE(srq_spec.input_quantized_types().contains(1)); + ASSERT_TRUE(srq_spec.input_quantized_types().at(1).has_dimension_specs()); - EXPECT_THAT( - srq_spec.input_quantized_types().at(1).dimension_specs().dimension(), - Eq(3)); + const QuantizedDimension& dimension_specs = + srq_spec.input_quantized_types().at(1).dimension_specs(); + ASSERT_TRUE(dimension_specs.has_dimension()); + EXPECT_THAT(dimension_specs.dimension(), Eq(3)); // Test that representative dataset config has been transferred to the // `CalibrationOptions`. @@ -285,6 +283,15 @@ TEST(ExpandPresetsTest, ExpandWeightOnlyPtqPresetDefault) { EXPECT_THAT(spec.matcher().function_name().regex(), StrEq("^.*(conv|dot_general).*")); EXPECT_TRUE(spec.method().has_weight_only_ptq()); + + const WeightOnlyPtq& weight_only_ptq_spec = spec.method().weight_only_ptq(); + + EXPECT_THAT(weight_only_ptq_spec.input_quantized_types(), + UnorderedElementsAre(Pair( + 1, Truly([](const auto& quantized_type) { + return quantized_type.has_dimension_specs() && + !quantized_type.dimension_specs().has_dimension(); + })))); } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc index a06c7f8ed79fb4..3a6a30a4105d4b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc @@ -16,11 +16,7 @@ limitations under the License. 
#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" -#include "tensorflow/core/framework/attr_value.pb.h" -#include "tensorflow/core/framework/node_def.pb.h" namespace stablehlo::quantization { @@ -29,15 +25,6 @@ void DisableDebugging(mlir::ModuleOp module_op) { [](mlir::TF::DumpTensorOp dump_op) { dump_op.setEnabled(false); }); } -void EnableDebugging(tensorflow::quantization::ExportedModel& exported_model) { - MutateNodeDefs(*exported_model.mutable_graph_def(), - [](tensorflow::NodeDef& node_def) { - if (node_def.op() == "DumpTensor") { - (*node_def.mutable_attr())["enabled"].set_b(true); - } - }); -} - void ChangeToQuantizedFilename(mlir::ModuleOp module_op) { module_op.walk([](mlir::TF::DumpTensorOp dump_op) { dump_op.setFileName("quantized_tensor_data.pb"); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h index f034e4d94ee4bf..feae14446c8515 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h @@ -16,16 +16,12 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_ #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h" namespace stablehlo::quantization { // Disables debugging on `DumpTensor` ops. void DisableDebugging(mlir::ModuleOp module_op); -// Enables debugging on `DumpTensor` ops. -void EnableDebugging(tensorflow::quantization::ExportedModel& exported_model); - // Changes the filename from `unquantized_tensor_data.pb` to // `quantized_tensor_data.pb`. void ChangeToQuantizedFilename(mlir::ModuleOp module_op); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.cc index 16a1013ae25166..94aa9ef780a522 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.cc @@ -15,11 +15,14 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include +#include #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" #include "tsl/platform/env.h" +#include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" namespace stablehlo::quantization::io { @@ -53,4 +56,32 @@ absl::StatusOr CreateTmpDir() { return CreateTmpDir(tsl::Env::Default()); } +absl::Status WriteStringToFile(const absl::string_view file_path, + const absl::string_view data) { + auto* env = tsl::Env::Default(); + return WriteStringToFile(env, std::string(file_path), data); +} + +absl::StatusOr ReadFileToString( + const absl::string_view file_path) { + auto* env = tsl::Env::Default(); + std::string data{}; + absl::Status read_status = + ReadFileToString(env, std::string(file_path), &data); + + if (read_status.ok()) { + return data; + } else { + return read_status; + } +} + +absl::StatusOr> ListDirectory( + absl::string_view directory) { + std::vector children; + TF_RETURN_IF_ERROR( + tsl::Env::Default()->GetChildren(std::string(directory), &children)); + return children; +} + } // namespace stablehlo::quantization::io diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h index bf17ba641f9da5..39c99436e361b3 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h @@ -16,9 +16,13 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ #include +#include +#include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/string_view.h" #include "tsl/platform/env.h" +#include "tsl/platform/errors.h" namespace stablehlo::quantization::io { @@ -41,6 +45,29 @@ absl::StatusOr CreateTmpDir(tsl::Env* env); // returned by `tsl::Env::Default`. absl::StatusOr CreateTmpDir(); +// Convenience function for writing string `data` to file without the need to +// pass `tsl::Env` instance. Internally it uses the default `tsl::Env::Default`. +absl::Status WriteStringToFile(absl::string_view file_path, + absl::string_view data); + +// Convenience function for reading string data from file at `file_path` without +// the need to pass `tsl::Env` instance. Internally it uses the default +// `tsl::Env::Default`. Returns an OK status with string data containing file +// contents. Returns non-ok status upon error, e.g. file doesn't exist. +absl::StatusOr ReadFileToString(absl::string_view file_path); + +// Lists all files and directories under the given directory. +absl::StatusOr> ListDirectory( + absl::string_view directory); + +template +absl::StatusOr ReadBinaryProto(const std::string& binary_file_path) { + MessageT message; + TF_RETURN_IF_ERROR( + tsl::ReadBinaryProto(tsl::Env::Default(), binary_file_path, &message)); + return message; +} + } // namespace stablehlo::quantization::io #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_IO_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc index f4f1c5c16589e4..180df43a62a249 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/io_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include +#include #include #include @@ -23,18 +24,21 @@ limitations under the License. #include "absl/functional/any_invocable.h" #include "absl/status/status.h" #include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "tsl/platform/env.h" #include "tsl/platform/file_system.h" -#include "tsl/platform/status.h" #include "tsl/platform/status_matchers.h" #include "tsl/platform/types.h" namespace stablehlo::quantization::io { namespace { +using ::testing::Eq; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Not; +using ::testing::SizeIs; +using ::testing::UnorderedElementsAre; using ::tsl::testing::IsOk; using ::tsl::testing::StatusIs; @@ -140,5 +144,63 @@ TEST(IoTest, CreateTmpDirWhenInvalidPathReturnsInternalError) { HasSubstr("Failed to create tmp dir"))); } +TEST(IoTest, WriteStringToFile) { + const std::string dst_file_path = + absl::StrCat(testing::TempDir(), "/tmp_file"); + + const absl::Status write_status = + WriteStringToFile(dst_file_path, "test_string"); + ASSERT_THAT(write_status, IsOk()); + + auto* const env = tsl::Env::Default(); + ASSERT_THAT(env->FileExists(dst_file_path), IsOk()); + + std::string data{}; + ASSERT_THAT(tsl::ReadFileToString(env, dst_file_path, &data), IsOk()); + + EXPECT_THAT(data, Eq("test_string")); +} + +TEST(IoTest, ReadFileToString) { + // Prepare a temp file and write some string to it. + const std::string src_file_path = + absl::StrCat(testing::TempDir(), "/tmp_file"); + + { + std::ofstream ofs(src_file_path); + ofs << "test_string"; + } + + // Test that the contents match. + const absl::StatusOr read_status = + ReadFileToString(src_file_path); + ASSERT_THAT(read_status, IsOk()); + EXPECT_THAT(*read_status, Eq("test_string")); +} + +TEST(IoTest, ListChildrenInDirectory) { + absl::StatusOr tmp_dir = CreateTmpDir(); + + ASSERT_THAT(tmp_dir, IsOk()); + + auto* const env = tsl::Env::Default(); + EXPECT_THAT(env->FileExists(*tmp_dir), IsOk()); + + ASSERT_THAT( + WriteStringToFile(absl::StrCat(*tmp_dir, "/tmp_file1"), "test_string"), + IsOk()); + ASSERT_THAT( + WriteStringToFile(absl::StrCat(*tmp_dir, "/tmp_file2"), "test_string"), + IsOk()); + ASSERT_THAT(env->RecursivelyCreateDir(absl::StrCat(*tmp_dir, "/subdir")), + IsOk()); + + absl::StatusOr> children = ListDirectory(*tmp_dir); + EXPECT_THAT(children, IsOk()); + EXPECT_THAT(children.value(), SizeIs(3)); + EXPECT_THAT(children.value(), + UnorderedElementsAre("subdir", "tmp_file1", "tmp_file2")); +} + } // namespace } // namespace stablehlo::quantization::io diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc index 622ff502c01ed9..490a9290c8342b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc @@ -32,7 +32,6 @@ using ::stablehlo::quantization::CalibrationOptions; using ::stablehlo::quantization::DebuggerConfig; using ::stablehlo::quantization::PipelineConfig; using ::stablehlo::quantization::QuantizationSpecs; -using ::stablehlo::quantization::StaticRangePtqPreset; void AddPreCalibrationPasses(OpPassManager& pm, const CalibrationOptions& calibration_options, @@ -51,7 +50,6 @@ void AddPreCalibrationPasses(OpPassManager& pm, } pm.addNestedPass( CreateInsertCustomAggregationOpsPass(calibration_options)); - pm.addPass(CreateIssueIDsOfCustomAggregationOpsPass()); } void 
AddPostCalibrationPasses(OpPassManager& pm, @@ -64,7 +62,6 @@ void AddPostCalibrationPasses(OpPassManager& pm, options.enable_per_channel_quantized_weight_ = true; // For debugging purposes. options.mlir_dump_file_name_ = "quantize_composite_functions"; - options.enable_weight_only_ = false; options.merge_fusion_with_dequantize_ = pipeline_config.merge_fusion_with_dequantize(); @@ -101,7 +98,6 @@ void AddWeightOnlyQuantizationPasses( QuantizeCompositeFunctionsPassOptions options; // For debugging purposes. options.mlir_dump_file_name_ = "quantize_composite_functions"; - options.enable_weight_only_ = true; pm.addPass(createQuantizeCompositeFunctionsPass(options)); // Add an inliner pass to inline quantized StableHLO functions. diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc index 001ece707cfe90..d164a8e07617e9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc @@ -14,12 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h" +#include + #include "absl/base/nullability.h" #include "absl/log/die_if_null.h" #include "absl/status/statusor.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" @@ -27,6 +31,7 @@ limitations under the License. namespace mlir::quant::stablehlo { +using ::stablehlo::quantization::GetReportFilePath; using ::stablehlo::quantization::PipelineConfig; using ::stablehlo::quantization::QuantizationConfig; using ::stablehlo::quantization::QuantizationSpecs; @@ -41,6 +46,11 @@ absl::StatusOr PostCalibrationComponent::Run( TF_RETURN_IF_ERROR(RunPasses( kName, /*add_passes_func=*/ [&config](PassManager& pm) { + // Add instrumentation to save quantization report after quantization. + pm.addInstrumentation( + std::make_unique( + GetReportFilePath(config))); + AddPostCalibrationPasses(pm, config.pipeline_config(), config.specs()); }, *ctx_, module_op)); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc index 93be3516d76f8d..f8181deca51a0e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc @@ -18,6 +18,8 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "absl/strings/str_cat.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -26,6 +28,7 @@ limitations under the License. 
#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tsl/platform/protobuf.h" // IWYU pragma: keep @@ -33,8 +36,10 @@ limitations under the License. namespace mlir::quant::stablehlo { namespace { +using ::stablehlo::quantization::Method; using ::stablehlo::quantization::QuantizationResult; using ::stablehlo::quantization::QuantizationResults; +using ::stablehlo::quantization::io::WriteStringToFile; using ::tsl::protobuf::TextFormat; // Given a `quantized_func_name` that starts with `kQuantizedFuncPrefix`, @@ -48,19 +53,27 @@ std::string GetCompositeFunctionName(const StringRef quantized_func_name) { // Retrieves `QuantizationResult` from `call_op`. If the callee's name starts // with `kQuantizedFuncPrefix` then a `QuantizationResult` will be returned with // its `name` field set to the callee's name reverted back to the lifted -// function's name. Otherwise, returns `std::nullopt`. +// function's name. Also, `call_op` must have the `kQuantizationMethodAttr` +// attribute, which is deserialized as `Method` and set in the returned +// `QuantizationResult`. Otherwise, it returns `std::nullopt`. std::optional GetQuantizationResult(func::CallOp call_op) { const StringRef callee_name = call_op.getCalleeAttr().getValue(); + if (!callee_name.starts_with(kQuantizedFuncPrefix)) { + return std::nullopt; // `call_op` is not a quantized function call. + } - if (callee_name.starts_with(kQuantizedFuncPrefix)) { - // TODO: b/329554870 - Transfer the `Method` used to quantize the op. - QuantizationResult result{}; - result.mutable_quantizable_unit()->set_name( - GetCompositeFunctionName(callee_name)); - return result; - } else { + absl::StatusOr method = GetQuantizationMethod(call_op); + if (!method.ok()) { + call_op->emitError() << "Failed to get quantization method: " + << method.status().ToString(); return std::nullopt; } + + QuantizationResult result{}; + result.mutable_quantizable_unit()->set_name( + GetCompositeFunctionName(callee_name)); + *result.mutable_method() = std::move(*method); + return result; } // Retrieves `QuantizationResult` from `xla_call_module_op`. If @@ -72,9 +85,8 @@ std::optional GetQuantizationResult(func::CallOp call_op) { std::optional GetQuantizationResult( TF::XlaCallModuleOp xla_call_module_op) { const StringAttr callee_name_attr = - xla_call_module_op - ->getDiscardableAttr(kOriginalStablehloEntryFunctionAttrName) - .dyn_cast_or_null(); + mlir::dyn_cast_or_null(xla_call_module_op->getDiscardableAttr( + kOriginalStablehloEntryFunctionAttrName)); // `TF::XlaCallModuleOp` without the `_original_entry_function` means it is // not a quantizable unit. @@ -152,4 +164,11 @@ void QuantizationReport::Print() const { llvm::outs().flush(); // Show the report immediately. 
} +absl::Status QuantizationReport::Save(const StringRef file_path) const { + std::string results_str{}; + TextFormat::PrintToString(GetQuantizationResults(), &results_str); + + return WriteStringToFile(file_path, results_str); +} + } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h index a362bb758cb60c..8252dda620dc3e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h @@ -17,7 +17,9 @@ limitations under the License. #include +#include "absl/status/status.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" namespace mlir::quant::stablehlo { @@ -50,6 +52,11 @@ class QuantizationReport { // Prints a human-readable report to stdout. void Print() const; + // Saves the report to `file_path`. The textproto representation of + // `QuantizationResults` will be written to the file. Returns non-ok status + // when the file write fails. + absl::Status Save(StringRef file_path) const; + private: ::stablehlo::quantization::QuantizationResults CollectResultsFromModuleOp( ModuleOp module_op) const; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc index 4783fb6beebc2d..690ee47e5b3c7d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc @@ -19,12 +19,17 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/test_base.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tsl/platform/protobuf.h" // IWYU pragma: keep +#include "tsl/platform/status_matchers.h" namespace mlir::quant::stablehlo { namespace { @@ -33,10 +38,14 @@ using ::stablehlo::quantization::Method; using ::stablehlo::quantization::QuantizableUnit; using ::stablehlo::quantization::QuantizationResult; using ::stablehlo::quantization::QuantizationResults; +using ::stablehlo::quantization::io::ReadFileToString; +using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::SizeIs; using ::testing::StrEq; +using ::testing::TempDir; using ::tsl::protobuf::TextFormat; +using ::tsl::testing::IsOk; using QuantizationReportTest = ::mlir::quant::QuantizationTestBase; @@ -74,7 +83,7 @@ TEST_F(QuantizationReportTest, InitializeWithModuleOp) { func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { %0 = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>> %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> - %2 = call @quantized_dot_general_fn(%1, %0) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %2 = call @quantized_dot_general_fn(%1, %0) {_quantization_method = "static_range_ptq { }"} : 
(tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> return %3 : tensor<1x3xf32> } @@ -96,11 +105,73 @@ TEST_F(QuantizationReportTest, InitializeWithModuleOp) { // Test that the quantized `QuantizableUnit` corresponding to // `composite_dot_general_fn` is captured. - // TODO: Transfer the `Method` used to quantize the op. const QuantizationResult& result = results.results(0); EXPECT_THAT(result.quantizable_unit().name(), StrEq("composite_dot_general_fn")); - EXPECT_FALSE(result.has_method()); + EXPECT_TRUE(result.method().has_static_range_ptq()); +} + +TEST_F(QuantizationReportTest, + InitializeWithModuleOpWithoutQuantizationMethodAttribute) { + // A quantized dot_general op but the `CallOp` is missing the + // `_quantization_method` attribute. + constexpr absl::string_view + kQuantizedDotGeneralMissingQuantizationMethodAttr = R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %2 = call @quantized_dot_general_fn(%1, %0) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + func.func private @quantized_dot_general_fn(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kQuantizedDotGeneralMissingQuantizationMethodAttr); + ASSERT_TRUE(module_op); + + const QuantizationReport report(*module_op); + const QuantizationResults& results = report.GetQuantizationResults(); + // The quantized call op without the _quantization_method attribute is not + // captured as a `QuantizationResult`. + ASSERT_THAT(results.results(), IsEmpty()); +} + +TEST_F(QuantizationReportTest, InitializeWithModuleOpWithInvalidCalleeName) { + // A quantized dot_general op but the callee function has an invalid name. It + // is expected to start with `quantized_`. 
+ constexpr absl::string_view kQuantizedDotGeneralWithInvalidCalleeName = + R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %2 = call @invalid_quantized_dot_general_fn(%1, %0) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + func.func private @invalid_quantized_dot_general_fn(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kQuantizedDotGeneralWithInvalidCalleeName); + ASSERT_TRUE(module_op); + + const QuantizationReport report(*module_op); + const QuantizationResults& results = report.GetQuantizationResults(); + // The quantized call op whose callee doesn't start with `quantized_` is not + // captured as a `QuantizationResult`. + ASSERT_THAT(results.results(), IsEmpty()); } TEST_F(QuantizationReportTest, InitializeWithModuleOpWithNonQuantizedOp) { @@ -141,11 +212,11 @@ TEST_F(QuantizationReportTest, func.func @main(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x3xf32> { // Non-quantized dot_general. %0 = stablehlo.constant dense<3.000000e+0> : tensor<2x3xf32> - %1 = "tf.XlaCallModule"(%arg0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> // Quantized dot_general. 
%2 = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>> %3 = stablehlo.uniform_quantize %arg1 : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> - %4 = call @quantized_dot_general_fn_2(%3, %2) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %4 = call @quantized_dot_general_fn_2(%3, %2) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> %5 = stablehlo.uniform_dequantize %4 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> // Add is there to prevent from dot_generals from being DCEed. %6 = stablehlo.add %1, %5 : tensor<1x3xf32> @@ -178,7 +249,7 @@ TEST_F(QuantizationReportTest, const QuantizationResult& quantized_result = results.results(0); EXPECT_THAT(quantized_result.quantizable_unit().name(), StrEq("composite_dot_general_fn_2")); - EXPECT_FALSE(quantized_result.has_method()); + EXPECT_TRUE(quantized_result.method().has_static_range_ptq()); // Test that the non-quantized op is captured in `results`. const QuantizationResult& non_quantized_result = results.results(1); @@ -203,9 +274,52 @@ TEST_F(QuantizationReportTest, ToString) { std::string result_str{}; TextFormat::PrintToString(report.GetQuantizationResults(), &result_str); - EXPECT_THAT(report.ToString(), testing::HasSubstr("Quantization Report")); - EXPECT_THAT(report.ToString(), testing::HasSubstr(result_str)); - EXPECT_THAT(report.ToString(), testing::HasSubstr("Quantization Report End")); + EXPECT_THAT(report.ToString(), HasSubstr("Quantization Report")); + EXPECT_THAT(report.ToString(), HasSubstr(result_str)); + EXPECT_THAT(report.ToString(), HasSubstr("Quantization Report End")); +} + +TEST_F(QuantizationReportTest, Save) { + constexpr absl::string_view kQuantizedDotGeneral = R"mlir( + func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>> + %1 = stablehlo.uniform_quantize %arg0 : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> + %2 = call @quantized_dot_general_fn(%1, %0) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + func.func private @quantized_dot_general_fn(%arg0: tensor<1x2x!quant.uniform>, %arg1: tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {1.000000e+0,2.000000e+0,3.000000e+0}>>) -> tensor<1x3x!quant.uniform> + %1 = stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %1 : tensor<1x3x!quant.uniform> + } + )mlir"; + + const OwningOpRef module_op = + ParseModuleOpString(kQuantizedDotGeneral); + ASSERT_TRUE(module_op); + + const QuantizationReport report(*module_op); + + const std::string dst_file_path = + absl::StrCat(TempDir(), "/quantization_report.txtpb"); + const absl::Status save_status = report.Save(dst_file_path); + ASSERT_THAT(save_status, IsOk()); + + const 
absl::StatusOr<std::string> file_data = ReadFileToString(dst_file_path);
+  ASSERT_THAT(file_data, IsOk());
+
+  // Test that the file data can be parsed as `QuantizationResults`.
+  QuantizationResults results{};
+  ASSERT_TRUE(TextFormat::ParseFromString(*file_data, &results));
+
+  // Check that `results` reflects the information of the quantized units
+  // properly.
+  ASSERT_THAT(results.results(), SizeIs(1));
+  EXPECT_THAT(results.results(0).quantizable_unit().name(),
+              StrEq("composite_dot_general_fn"));
+  EXPECT_TRUE(results.results(0).method().has_static_range_ptq());
+}
 
 }  // namespace
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc
index a223a0b03f58a4..295ab06eb1bf70 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc
@@ -77,7 +77,7 @@ absl::StatusOr<ImportedMlirModuleOp> SavedModelToMlirModuleOp(
         module_op.status().ToString()));
   }
 
-  return std::make_pair(module_op->release(), std::move(bundle));
+  return std::make_pair(std::move(*module_op), std::move(bundle));
 }
 
 absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>>
@@ -119,7 +119,7 @@ void UpdateFunctionAliases(
       });
 }
 
-absl::StatusOr<ModuleOp> ImportSavedModel(
+absl::StatusOr<OwningOpRef<ModuleOp>> ImportSavedModel(
     const absl::string_view saved_model_path,
     const std::vector<std::string>& signature_keys,
    const std::unordered_set<std::string>& tags,
@@ -132,7 +132,7 @@ absl::StatusOr<OwningOpRef<ModuleOp>> ImportSavedModel(
       SavedModelToMlirModuleOp(saved_model_path, tags, signature_keys, ctx));
   auto [module_op, saved_model_bundle] = std::move(imported_module);
 
-  UpdateFunctionAliases(function_aliases, module_op);
+  UpdateFunctionAliases(function_aliases, *module_op);
 
   // Collect the names of the functions that have aliases so that they may not
   // be inlined.
@@ -143,11 +143,11 @@ absl::StatusOr<OwningOpRef<ModuleOp>> ImportSavedModel(
   TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph(
       mlir_dump_file_prefix, /*is_inliner_run=*/true,
-      /*noinline_functions=*/aliased_function_names, module_op, &ctx,
+      /*noinline_functions=*/aliased_function_names, *module_op, &ctx,
       saved_model_bundle == nullptr ? nullptr
                                     : saved_model_bundle->GetSession(),
       /*run_tf_to_stablehlo=*/true, /*deserialize_xla_call_module=*/false));
-  return module_op;
+  return std::move(module_op);
 }
 
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h
index 631d2e714900aa..8f1e4236e09823 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
@@ -38,7 +39,8 @@ namespace mlir::quant::stablehlo {
 // `tensorflow::Session` which may be useful when reading values from resources
 // (e.g. `TF::VarHandleOp`s).
 using ImportedMlirModuleOp =
-    std::pair<ModuleOp, std::unique_ptr<::tensorflow::SavedModelBundle>>;
+    std::pair<OwningOpRef<ModuleOp>,
+              std::unique_ptr<::tensorflow::SavedModelBundle>>;
 
 // Loads a SavedModel at `saved_model_path` and converts it to `mlir::ModuleOp`.
//
@@ -72,7 +74,7 @@ void UpdateFunctionAliases(
 // Loads a SavedModel to `mlir::ModuleOp` and performs preprocessing including
 // shape inference and graph freezing.
 // TODO: b/329206105 - Add unit tests after decomposing preprocessing passes.
-absl::StatusOr<ModuleOp> ImportSavedModel(
+absl::StatusOr<OwningOpRef<ModuleOp>> ImportSavedModel(
     absl::string_view saved_model_path,
     const std::vector<std::string>& signature_keys,
     const std::unordered_set<std::string>& tags,
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc
index 015ab7605a05b7..3d350613629c7e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h"
@@ -105,7 +106,7 @@ absl::Status QuantizeStaticRangePtq(
   }
 
   TF_ASSIGN_OR_RETURN(
-      ModuleOp module_op,
+      OwningOpRef<ModuleOp> module,
       ImportSavedModel(src_saved_model_path, signature_keys, tags,
                        quantization_config, PreCalibrationComponent::kName,
                        *function_aliases, *ctx));
 
   StaticRangePtqComponent static_range_ptq_component(
       ctx.get(), &py_function_library, src_saved_model_path, signature_keys,
       tags, signature_def_map, *function_aliases);
-  TF_ASSIGN_OR_RETURN(module_op, static_range_ptq_component.Run(
-                                     module_op, quantization_config));
+  TF_ASSIGN_OR_RETURN(
+      *module, static_range_ptq_component.Run(*module, quantization_config));
 
   TF_ASSIGN_OR_RETURN(
       const ExportedModel post_calibrated_exported_model,
       CreateExportedModel(signature_keys, tags, quantization_config,
                           PostCalibrationComponent::kName, *function_aliases,
-                          *ctx, module_op));
+                          *ctx, *module));
 
   // Remove the `tpu` tag for exporting because the output quantized model is
   // essentially a CPU model.
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
index bbd9a9c25620bd..f1df09c36ccce0 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
@@ -28,11 +28,13 @@ limitations under the License.
#include "absl/strings/string_view.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" @@ -42,6 +44,7 @@ limitations under the License. namespace mlir::quant::stablehlo { +using ::stablehlo::quantization::GetReportFilePath; using ::stablehlo::quantization::QuantizationConfig; using ::tensorflow::SignatureDef; using ::tensorflow::quantization::ExportedModel; @@ -56,6 +59,11 @@ absl::StatusOr WeightOnlyPtqComponent::Run( TF_RETURN_IF_ERROR(RunPasses( kName, /*add_passes_func=*/ [&config](PassManager& pm) { + // Add instrumentation to save quantization report after quantization. + pm.addInstrumentation( + std::make_unique( + GetReportFilePath(config))); + AddWeightOnlyQuantizationPasses(pm, config.specs(), config.pipeline_config(), config.debugger_config()); @@ -85,20 +93,20 @@ absl::Status QuantizeWeightOnlyPtq( } TF_ASSIGN_OR_RETURN( - ModuleOp module_op, + auto module, ImportSavedModel(src_saved_model_path, signature_keys, tags, quantization_config, WeightOnlyPtqComponent::kName, *function_aliases, *ctx)); WeightOnlyPtqComponent weight_only_ptq_component(ctx.get()); TF_ASSIGN_OR_RETURN( - module_op, weight_only_ptq_component.Run(module_op, quantization_config)); + *module, weight_only_ptq_component.Run(*module, quantization_config)); TF_ASSIGN_OR_RETURN( const ExportedModel post_calibrated_exported_model, CreateExportedModel(signature_keys, tags, quantization_config, WeightOnlyPtqComponent::kName, *function_aliases, - *ctx, module_op)); + *ctx, *module)); // Remove the `tpu` tag for exporting because the output quantized model is // essentially a CPU model. 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/BUILD
new file mode 100644
index 00000000000000..476192965a3f8f
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/BUILD
@@ -0,0 +1,46 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+
+package(
+    # copybara:uncomment default_applicable_licenses = ["@stablehlo//:license"],
+    default_visibility = [
+        "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__",
+    ],
+    licenses = ["notice"],
+)
+
+cc_library(
+    name = "save_report",
+    srcs = ["save_report.cc"],
+    hdrs = ["save_report.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:report",
+        "@com_google_absl//absl/base:nullability",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+    ],
+)
+
+tf_cc_test(
+    name = "save_report_test",
+    srcs = ["save_report_test.cc"],
+    deps = [
+        ":save_report",
+        "//tensorflow/compiler/mlir/quantization/common:test_base",
+        "//tensorflow/compiler/mlir/quantization/stablehlo:passes",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/platform:protobuf",
+        "@local_tsl//tsl/platform:status_matchers",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.cc b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.cc
new file mode 100644
index 00000000000000..e1a705cdbb24f6
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.cc
@@ -0,0 +1,95 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h"
+
+#include <optional>
+#include <string>
+
+#include "absl/base/nullability.h"
+#include "absl/log/log.h"
+#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h"
+
+namespace mlir::quant::stablehlo {
+namespace {
+
+// Converts `std::optional<absl::string_view>` to `std::optional<std::string>`.
+// A `std::nullopt` is returned when `view` is `std::nullopt`.
+std::optional<std::string> OptionalStringViewToOptionalString(
+    std::optional<absl::string_view> view) {
+  if (view == std::nullopt) return std::nullopt;
+  return std::make_optional<std::string>(*view);
+}
+
+// Whether the pass is `QuantizeCompositeFunctionPass`.
+bool IsQuantizeCompositeFunctionPass(absl::Nullable<Pass*> pass,
+                                     absl::Nullable<Operation*> op) {
+  // It is known that `op` is `ModuleOp` when `pass` is
+  // `QuantizeCompositeFunctionPass`, but the check is still performed to be
+  // defensive.
+  return pass != nullptr &&
+         pass->getArgument() == "stablehlo-quantize-composite-functions" &&
+         isa_and_nonnull<ModuleOp>(op);
+}
+
+// The report is saved only when:
+// * `QuantizeCompositeFunctionPass` has just been run,
+// * the pass ran on a `ModuleOp`, and
+// * `file_path` is not `std::nullopt`.
+bool ShouldSaveReport(absl::Nullable<Pass*> pass,
+                      absl::Nullable<Operation*> op,
+                      const std::optional<std::string>& file_path) {
+  return file_path != std::nullopt &&
+         IsQuantizeCompositeFunctionPass(pass, op);
+}
+
+void SaveReport(const QuantizationReport& report,
+                const absl::string_view file_path) {
+  if (const absl::Status save_status = report.Save(file_path);
+      save_status.ok()) {
+    LOG(INFO) << "Successfully saved quantization report to: " << file_path;
+  } else {
+    LOG(ERROR) << "Failed to save quantization report to: " << file_path
+               << " with status: " << save_status;
+  }
+}
+
+}  // namespace
+
+SaveQuantizationReportInstrumentation::SaveQuantizationReportInstrumentation(
+    std::optional<absl::string_view> file_path)
+    : file_path_(OptionalStringViewToOptionalString(file_path)) {}
+
+void SaveQuantizationReportInstrumentation::runAfterPass(Pass* pass,
+                                                         Operation* op) {
+  // Only run after `QuantizeCompositeFunctionPass`.
+  if (!IsQuantizeCompositeFunctionPass(pass, op)) return;
+
+  auto module_op = cast<ModuleOp>(op);
+  const QuantizationReport report(module_op);
+
+  // Print a human-readable report to stdout regardless of whether the report
+  // is saved to file.
+  report.Print();
+
+  // Exit early if the report should not be saved to file.
+  if (!ShouldSaveReport(pass, op, file_path_)) return;
+
+  SaveReport(report, *file_path_);
+}
+
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h
new file mode 100644
index 00000000000000..e690e6252b3393
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h
@@ -0,0 +1,52 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_SAVE_REPORT_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_SAVE_REPORT_H_
+
+#include <optional>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Pass/PassInstrumentation.h"  // from @llvm-project
+
+namespace mlir::quant::stablehlo {
+
+// A `PassInstrumentation` that saves the quantization report to a file after
+// `QuantizeCompositeFunctionsPass` is run. It inspects the `ModuleOp` after
+// quantization and analyzes the quantizable units and the quantization methods
+// used. The report file is saved at `file_path` and contains a textproto of
+// `QuantizationResults`. `file_path`'s base directories should already exist
+// (this pass instrumentation will not `mkdir` them).
+//
+// See `QuantizationReport` for further details on the quantization report.
+class SaveQuantizationReportInstrumentation : public PassInstrumentation {
+ public:
+  // `file_path` is the path to save the report file. The report file is in
+  // textproto format, so a `.txtpb` extension is preferred, but using another
+  // extension does not cause an error. This instrumentation will not be run
+  // if `file_path` is `std::nullopt`.
+  explicit SaveQuantizationReportInstrumentation(
+      std::optional<absl::string_view> file_path);
+
+  void runAfterPass(Pass* pass, Operation* op) override;
+
+ private:
+  std::optional<std::string> file_path_;  // Path to the file to save the report.
+};
+
+}  // namespace mlir::quant::stablehlo
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_INSTRUMENTATIONS_SAVE_REPORT_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report_test.cc
new file mode 100644
index 00000000000000..27d282dc309a7e
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report_test.cc
@@ -0,0 +1,186 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/stablehlo/instrumentations/save_report.h"
+
+#include <memory>
+#include <optional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/test_base.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h"
+#include "tsl/platform/protobuf.h"  // IWYU pragma: keep
+#include "tsl/platform/status_matchers.h"
+
+namespace mlir::quant::stablehlo {
+namespace {
+
+using ::stablehlo::quantization::QuantizationResults;
+using ::stablehlo::quantization::io::ReadFileToString;
+using ::testing::SizeIs;
+using ::testing::StrEq;
+using ::tsl::protobuf::TextFormat;
+using ::tsl::testing::IsOk;
+using ::tsl::testing::StatusIs;
+
+using SaveQuantizationReportInstrumentationTest = QuantizationTestBase;
+
+TEST_F(SaveQuantizationReportInstrumentationTest, SaveReport) {
+  constexpr absl::string_view kModuleWithCompositeDotGeneral = R"mlir(
+    func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> {
+      %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
+      %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+      %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+      %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32>
+      return %2 : tensor<1x3xf32>
+    }
+
+    func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+      %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+      return %0 : tensor<1x3xf32>
+    }
+  )mlir";
+
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kModuleWithCompositeDotGeneral);
+  ASSERT_TRUE(module_op);
+
+  // Create a pass manager with `SaveQuantizationReportInstrumentation` and
+  // `QuantizeCompositeFunctionsPass`. Run the passes against `module_op`.
+  PassManager pm(ctx_.get());
+
+  QuantizeCompositeFunctionsPassOptions options;
+  pm.addPass(createQuantizeCompositeFunctionsPass(options));
+
+  const std::string report_file_path =
+      absl::StrCat(testing::TempDir(), "/save_report.txtpb");
+  pm.addInstrumentation(
+      std::make_unique<SaveQuantizationReportInstrumentation>(
+          report_file_path));
+
+  const LogicalResult run_result = pm.run(*module_op);
+  ASSERT_TRUE(succeeded(run_result));
+
+  // Check that the report file contains a `QuantizationResults` textproto
+  // reflecting the quantization results; in this case,
+  // `composite_dot_general_fn` quantized with the `static_range_ptq` method.
+  const absl::StatusOr<std::string> file_data =
+      ReadFileToString(report_file_path);
+  ASSERT_THAT(file_data, IsOk());
+
+  /*
+  results {
+    quantizable_unit {
+      name: "composite_dot_general_fn"
+    }
+    method { static_range_ptq { } }
+  }
+  */
+  QuantizationResults results{};
+  ASSERT_TRUE(TextFormat::ParseFromString(*file_data, &results));
+  ASSERT_THAT(results.results(), SizeIs(1));
+  EXPECT_THAT(results.results(0).quantizable_unit().name(),
+              StrEq("composite_dot_general_fn"));
+  EXPECT_TRUE(results.results(0).method().has_static_range_ptq());
+}
+
+TEST_F(SaveQuantizationReportInstrumentationTest,
+       ReportNotSavedWhenNoQuantizeCompositeFunctionsPass) {
+  constexpr absl::string_view kModuleWithCompositeDotGeneral = R"mlir(
+    func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> {
+      %cst = "stablehlo.constant"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
+      %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+      %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+      %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32>
+      return %2 : tensor<1x3xf32>
+    }
+
+    func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+      %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+      return %0 : tensor<1x3xf32>
+    }
+  )mlir";
+
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kModuleWithCompositeDotGeneral);
+  ASSERT_TRUE(module_op);
+
+  // Create a pass manager with `SaveQuantizationReportInstrumentation` and a
+  // pass that is not `QuantizeCompositeFunctionsPass`. Run the passes against
+  // `module_op`.
+  PassManager pm(ctx_.get());
+
+  pm.addPass(createPrepareQuantizePass());
+
+  const std::string report_file_path = absl::StrCat(
+      testing::TempDir(),
+      "/report_not_saved_no_quantize_composite_functions_pass.txtpb");
+  pm.addInstrumentation(
+      std::make_unique<SaveQuantizationReportInstrumentation>(
+          report_file_path));
+
+  const LogicalResult run_result = pm.run(*module_op);
+  ASSERT_TRUE(succeeded(run_result));
+
+  // The report file is not created because `QuantizeCompositeFunctionsPass`
+  // was not run.
+  EXPECT_THAT(ReadFileToString(report_file_path),
+              StatusIs(absl::StatusCode::kNotFound));
+}
+
+TEST_F(SaveQuantizationReportInstrumentationTest,
+       ReportNotSavedWhenReportFilePathIsNullopt) {
+  constexpr absl::string_view kModuleWithCompositeDotGeneral = R"mlir(
+    func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> {
+      %cst = "stablehlo.constant"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
+      %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+      %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+      %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32>
+      return %2 : tensor<1x3xf32>
+    }
+
+    func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+      %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+      return %0 : tensor<1x3xf32>
+    }
+  )mlir";
+
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kModuleWithCompositeDotGeneral);
+  ASSERT_TRUE(module_op);
+
+  PassManager pm(ctx_.get());
+
+  QuantizeCompositeFunctionsPassOptions options;
+  pm.addPass(createQuantizeCompositeFunctionsPass(options));
+  pm.addInstrumentation(
+      std::make_unique<SaveQuantizationReportInstrumentation>(
+          /*file_path=*/std::nullopt));
+
+  // The report file is not created and `SaveQuantizationReportInstrumentation`
+  // is not run, but the passes still run without errors.
+  const LogicalResult run_result = pm.run(*module_op);
+  ASSERT_TRUE(succeeded(run_result));
+}
+
+}  // namespace
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc
index 3018db7b2649e9..54b0744fcda94d 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc
@@ -118,6 +118,8 @@ std::unique_ptr<OpQuantSpec> GetStableHloOpQuantSpec(Operation* op) {
       if (auto optional_dim = GetDotGeneralQuantizationDim(dot_general_op);
           optional_dim) {
         spec->coeff_op_quant_dim[1] = optional_dim.value();
+      } else {
+        spec->coeff_op_quant_dim[1] = -1;
       }
       if (function_name.contains("with_bias")) {
         spec->biases_params[2] = {{0, 1},
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc
index cd861d934e75f8..5575a7516fccc9 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc
@@ -88,7 +88,7 @@ FailureOr<TensorType> GetUniformQuantizedType(
   }
 
   auto original_element_type = getElementTypeOrSelf(original_type);
-  if (!original_element_type.isa<TF::Qint8Type, TF::Qint32Type>()) {
+  if (!mlir::isa<TF::Qint8Type, TF::Qint32Type>(original_element_type)) {
     return rewriter.notifyMatchFailure(
         op, "Quantized type must be qint8 or qint32.");
   }
@@ -112,7 +112,7 @@ FailureOr<TensorType> GetUniformQuantizedType(
         quantized_dimension, storage_type_min, storage_type_max);
   }
 
-  return original_type.cast<TensorType>().clone(elem_ty);
+  return mlir::cast<TensorType>(original_type).clone(elem_ty);
 }
 
 // If operand is TF const op, create MHLO constant op from the contents.
@@ -178,8 +178,8 @@ FailureOr<DenseIntElementsAttr> ConvertPaddingAttr(
     const xla::ConvolutionDimensionNumbers &dnums, PatternRewriter &rewriter) {
   StringAttr conv_padding = op.getPaddingAttr();
   SmallVector<int64_t> padding_nums;
-  ShapedType lhs_shape = op.getLhs().getType().template cast<ShapedType>();
-  ShapedType rhs_shape = op.getRhs().getType().template cast<ShapedType>();
+  ShapedType lhs_shape = mlir::cast<ShapedType>(op.getLhs().getType());
+  ShapedType rhs_shape = mlir::cast<ShapedType>(op.getRhs().getType());
 
   // Handle only static shape cases.
   // TODO(b/260284866): Handle dynamic shape cases.
@@ -192,26 +192,26 @@ FailureOr<DenseIntElementsAttr> ConvertPaddingAttr(
   const int64_t padding_nums_size = 2 * (rhs_shape.getRank() - 2);
   padding_nums.reserve(padding_nums_size);
-  if (conv_padding.strref().equals("EXPLICIT")) {
+  if (conv_padding.strref() == "EXPLICIT") {
     for (auto padding_elem :
          op.getExplicitPaddingAttr().template getAsRange<IntegerAttr>()) {
       padding_nums.push_back(padding_elem.getInt());
     }
-  } else if (conv_padding.strref().equals("VALID")) {
+  } else if (conv_padding.strref() == "VALID") {
     padding_nums.resize(padding_nums_size, 0);
   } else {
     padding_nums.resize(padding_nums_size);
     for (int i = 0; i < dnums.input_spatial_dimensions_size(); ++i) {
       const int64_t stride =
-          op.getWindowStridesAttr()[i].template cast<IntegerAttr>().getInt();
+          mlir::cast<IntegerAttr>(op.getWindowStridesAttr()[i]).getInt();
       const int64_t lhs_size_dilated =
           ::tensorflow::UniformQuantizedConvolutionParams::DilatedSize(
               lhs_shape.getDimSize(dnums.input_spatial_dimensions(i)),
-              op.getLhsDilationAttr()[i].template cast<IntegerAttr>().getInt());
+              mlir::cast<IntegerAttr>(op.getLhsDilationAttr()[i]).getInt());
       const int64_t rhs_size_dilated =
           ::tensorflow::UniformQuantizedConvolutionParams::DilatedSize(
               rhs_shape.getDimSize(dnums.kernel_spatial_dimensions(i)),
-              op.getRhsDilationAttr()[i].template cast<IntegerAttr>().getInt());
+              mlir::cast<IntegerAttr>(op.getRhsDilationAttr()[i]).getInt());
 
       const int64_t output_size = (lhs_size_dilated + stride - 1) / stride;
       const int64_t total_padding = std::max(
@@ -262,7 +262,7 @@ FailureOr<SmallVector<NamedAttribute>> ConvertToMhloConvolutionOpAttrs(
         attr.getName() == op.getLhsDilationAttrName() ||
         attr.getName() == op.getRhsDilationAttrName()) {
       attr.setValue(ConvertToDenseElementsAttr(
-          attr.getValue().template cast<ArrayAttr>(), rewriter));
+          mlir::cast<ArrayAttr>(attr.getValue()), rewriter));
       converted_attrs.push_back(attr);
     }
   }
@@ -362,9 +362,9 @@ class ConvertUniformQuantizeOp
         op->getLoc(), *output_type, op.getInput());
     rewriter.replaceOpWithNewOp<mhlo::ConvertOp>(
         op,
-        output_type->clone(output_type->getElementType()
-                               .dyn_cast<quant::UniformQuantizedType>()
-                               .getStorageType()),
+        output_type->clone(
+            mlir::dyn_cast<quant::UniformQuantizedType>(
+                output_type->getElementType())
+                .getStorageType()),
         result);
 
     return success();
@@ -438,9 +438,9 @@ class ConvertUniformRequantizeOp
         op->getLoc(), *output_type, input_quant);
     rewriter.replaceOpWithNewOp<mhlo::ConvertOp>(
         op,
-        output_type->clone(output_type->getElementType()
-                               .dyn_cast<quant::UniformQuantizedType>()
-                               .getStorageType()),
+        output_type->clone(
+            mlir::dyn_cast<quant::UniformQuantizedType>(
+                output_type->getElementType())
+                .getStorageType()),
         result);
     return success();
   }
@@ -502,9 +502,9 @@ class ConvertUniformQuantizedDotOp
         /*precision_config=*/nullptr);
     rewriter.replaceOpWithNewOp<mhlo::ConvertOp>(
         op,
-        output_type->clone(output_type->getElementType()
-                               .dyn_cast<quant::UniformQuantizedType>()
-                               .getStorageType()),
+        output_type->clone(
+            mlir::dyn_cast<quant::UniformQuantizedType>(
+                output_type->getElementType())
+                .getStorageType()),
         result);
     return success();
   }
@@ -564,9 +564,9 @@ class ConvertUniformQuantizedConvolutionOp
         op->getLoc(), *output_type, operands, *converted_attrs_or);
     rewriter.replaceOpWithNewOp<mhlo::ConvertOp>(
         op,
-        output_type->clone(output_type->getElementType()
-                               .dyn_cast<quant::UniformQuantizedType>()
-                               .getStorageType()),
+        output_type->clone(
+            mlir::dyn_cast<quant::UniformQuantizedType>(
+                output_type->getElementType())
+                .getStorageType()),
         result);
     return success();
   }
@@ -582,7 +582,7 @@ class ConvertUniformQuantizedAddOp
                   ConversionPatternRewriter &rewriter) const override {
     Value lhs = adaptor.getLhs();
-    auto lhs_type = lhs.getType().cast<ShapedType>();
+    auto lhs_type = mlir::cast<ShapedType>(lhs.getType());
     if (!lhs_type.hasRank()) {
       return rewriter.notifyMatchFailure(
           op, "Legalization supports cases where only lhs rank known.");
@@ -632,9 +632,9 @@ class ConvertUniformQuantizedAddOp
         op->getLoc(), *output_type, lhs, *rhs_or,
         broadcast_dims);
     rewriter.replaceOpWithNewOp<mhlo::ConvertOp>(
         op,
-        output_type->clone(output_type->getElementType()
-                               .dyn_cast<quant::UniformQuantizedType>()
-                               .getStorageType()),
+        output_type->clone(
+            mlir::dyn_cast<quant::UniformQuantizedType>(
+                output_type->getElementType())
+                .getStorageType()),
         result);
     return success();
   }
@@ -692,9 +692,9 @@ class ConvertUniformQuantizedClipByValueOp
         op->getLoc(), *output_type, res_min_clipped, *max_or, broadcast_dims);
     rewriter.replaceOpWithNewOp<mhlo::ConvertOp>(
         op,
-        output_type->clone(output_type->getElementType()
-                               .dyn_cast<quant::UniformQuantizedType>()
-                               .getStorageType()),
+        output_type->clone(
+            mlir::dyn_cast<quant::UniformQuantizedType>(
+                output_type->getElementType())
+                .getStorageType()),
         res_max_clipped);
     return success();
   }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
index 65192fc1117673..f07097a109a0af 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
@@ -71,7 +71,7 @@ bool IsIllegalType(Type type) {
 // If input is not TF qint types, returns the original type.
 Type ToLegalType(Type type) {
   if (IsTFQintType(type)) return GetIntTypeFromTFQint(type);
-  if (auto shaped = type.dyn_cast<ShapedType>()) {
+  if (auto shaped = mlir::dyn_cast<ShapedType>(type)) {
     Type elem = shaped.getElementType();
     if (IsTFQintType(elem)) return shaped.clone(ToLegalType(elem));
   }
@@ -289,7 +289,7 @@ class TFConstOpQuantToIntPattern : public OpConversionPattern<TF::ConstOp> {
     }
     auto dense_attr_or = GetDenseAttrFromTensorProtoAttr(
         tensor_proto_attr.getValue(),
-        ToLegalType(op.getOutput().getType()).dyn_cast<TensorType>());
+        mlir::dyn_cast<TensorType>(ToLegalType(op.getOutput().getType())));
     if (failed(dense_attr_or)) {
       op->emitError("failed to get DenseElementAttr.");
       return failure();
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc
index 2825195addea12..7484ed89aa51b1 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/verify_quant_legalization.cc
@@ -31,6 +31,7 @@ limitations under the License.
#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -53,7 +54,7 @@ class VerifyQuantLegalization bool IsQuantType(Type type) { auto element_type = getElementTypeOrSelf(type); - return element_type.isa() || + return mlir::isa(element_type) || IsTFQintType(element_type); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc index 0204a19452bb0d..4a85786dc94937 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc @@ -143,8 +143,8 @@ class BFloat16TypePattern : public ConversionPattern { state.attributes.set( const_op.getValueAttrName(), DenseFPElementsAttr::get( - const_op.getValue().getType().dyn_cast().clone( - rewriter.getBF16Type()), + mlir::dyn_cast(const_op.getValue().getType()) + .clone(rewriter.getBF16Type()), bfloat16_values)); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc index 5be09ce2ad47ef..686204030c1fdc 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc @@ -155,7 +155,7 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp PatternRewriter& rewriter) const override { auto transpose_op = cast(op.getOperand(0).getDefiningOp()); - const auto result_type = op.getResult(0).getType().cast(); + const auto result_type = mlir::cast(op.getResult(0).getType()); const SmallVector new_result_shape = Permute(result_type.getShape(), kNchwToNhwcPermutation); @@ -169,16 +169,16 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp op.getLoc(), new_result_type, transpose_op.getOperand(), /*init_value=*/op.getOperand(1), /*window_dimensions=*/ - PermuteI64ArrayAttr(rewriter, op.getWindowDimensionsAttr(), + PermuteI64ArrayAttr(rewriter, op.getWindowDimensions(), kNchwToNhwcPermutation), /*window_strides=*/ - PermuteI64ArrayAttr(rewriter, op.getWindowStridesAttr(), + PermuteI64ArrayAttr(rewriter, op.getWindowStrides(), kNchwToNhwcPermutation), /*base_dilations=*/ - PermuteI64ArrayAttr(rewriter, op.getBaseDilationsAttr(), + PermuteI64ArrayAttr(rewriter, op.getBaseDilations(), kNchwToNhwcPermutation), /*window_dilations=*/ - PermuteI64ArrayAttr(rewriter, op.getWindowDilationsAttr(), + PermuteI64ArrayAttr(rewriter, op.getWindowDilations(), kNchwToNhwcPermutation), /*padding=*/DenseIntElementsAttr(nullptr)); @@ -199,12 +199,13 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp // `array_attr` and `permutation` must be equal. Returns a null attribute // if `array_attr` is null. 
   DenseI64ArrayAttr PermuteI64ArrayAttr(
-      PatternRewriter& rewriter, const DenseI64ArrayAttr array_attr,
+      PatternRewriter& rewriter,
+      const std::optional<ArrayRef<int64_t>> array_attr,
       const ArrayRef<int64_t> permutation) const {
-    if (array_attr == nullptr) return DenseI64ArrayAttr(nullptr);
+    if (!array_attr.has_value()) return DenseI64ArrayAttr(nullptr);
 
     return rewriter.getDenseI64ArrayAttr(
-        Permute<int64_t>(array_attr, permutation));
+        Permute<int64_t>(array_attr.value(), permutation));
   }
 
   LogicalResult MatchMaxPoolReduceWindowOp(
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
index 051745c0d6792b..06e38c3935c417 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
@@ -127,12 +127,13 @@ class FoldTransposedConstantOp
     if (!const_op) return failure();
 
     // Only support float tensors.
-    auto tensor_type = const_op.getType().dyn_cast_or_null<TensorType>();
+    auto tensor_type = mlir::dyn_cast_or_null<TensorType>(const_op.getType());
     if (!tensor_type || !tensor_type.getElementType().isF32()) {
       return failure();
     }
 
-    return success(const_op.getValue().isa_and_nonnull<DenseFPElementsAttr>());
+    return success(
+        mlir::isa_and_nonnull<DenseFPElementsAttr>(const_op.getValue()));
   }
 
   void rewrite(mlir::stablehlo::TransposeOp op,
@@ -140,7 +141,8 @@ class FoldTransposedConstantOp
     auto const_op =
         cast<mlir::stablehlo::ConstantOp>(op.getOperand().getDefiningOp());
 
-    const auto value_attr = const_op.getValue().cast<DenseFPElementsAttr>();
+    const auto value_attr =
+        mlir::cast<DenseFPElementsAttr>(const_op.getValue());
     const ArrayRef<int64_t> original_shape =
         value_attr.getShapedType().getShape();
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc
new file mode 100644
index 00000000000000..8cb0b645c312cf
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc
@@ -0,0 +1,189 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project  // IWYU pragma: keep
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"  // IWYU pragma: keep
+#include "tsl/platform/path.h"
+
+namespace mlir::quant::stablehlo {
+namespace {
+
+std::string GetOutputFilePath(absl::string_view calibration_data_dir,
+                              absl::string_view func_name,
+                              int32_t output_file_idx) {
+  return tsl::io::JoinPath(calibration_data_dir,
+                           llvm::Twine(func_name)
+                               .concat("_")
+                               .concat(std::to_string(output_file_idx))
+                               .concat(".pb")
+                               .str());
+}
+
+// Finds `CustomAggregator` ops and collects their outputs and attributes.
+void FindCustomAggregatorOps(
+    Region& region,
+    const std::unordered_set<std::string>& aggregator_ops_to_ignore,
+    SmallVector<Value>& statistics_outputs, SmallVector<StringRef>& ids,
+    SmallVector<int32_t>& calibration_methods) {
+  for (auto op : region.getOps<TF::CustomAggregatorOp>()) {
+    if (aggregator_ops_to_ignore.count(op.getId().str())) continue;
+
+    ids.push_back(op.getId());
+    calibration_methods.push_back(op.getCalibrationMethod());
+    statistics_outputs.push_back(op.getMin());
+    statistics_outputs.push_back(op.getMax());
+    statistics_outputs.push_back(op.getHistogram());
+  }
+}
+
+// Inserts a `CalibrationStatisticsSaverOp` at the end of the region.
+LogicalResult InsertCalibrationStatisticsSaverOp(
+    Region& region, MLIRContext& ctx, absl::string_view output_file_path,
+    const std::unordered_set<std::string>& aggregator_ops_to_ignore) {
+  SmallVector<Value> statistics_outputs;
+  SmallVector<StringRef> ids;
+  SmallVector<int32_t> calibration_methods;
+  FindCustomAggregatorOps(region, aggregator_ops_to_ignore, statistics_outputs,
+                          ids, calibration_methods);
+  if (statistics_outputs.empty()) return failure();
+
+  OpBuilder builder(&ctx);
+  // Set the insertion point right before the return op.
+  builder.setInsertionPoint(&region.back().back());
+
+  StringAttr output_file_path_attr = builder.getStringAttr(output_file_path);
+  ArrayAttr ids_attr = builder.getStrArrayAttr(ids);
+  ArrayAttr calibration_methods_attr =
+      builder.getI32ArrayAttr(calibration_methods);
+  builder.create<TF::CalibrationStatisticsSaverOp>(
+      region.getLoc(), statistics_outputs, output_file_path_attr, ids_attr,
+      calibration_methods_attr);
+  return success();
+}
+
+// Returns true if the op contains a `CalibrationStatisticsSaverOp`.
+bool ContainCalibrationStatisticsSaverOp(Operation* op) {
+  // Check the region for CaseRegionOp, IfRegionOp and WhileRegionOp.
+  for (Region& region : op->getRegions()) {
+    if (!region.getOps<TF::CalibrationStatisticsSaverOp>().empty()) {
+      return true;
+    }
+  }
+
+  SymbolTable symbol_table(op->getParentOfType<ModuleOp>());
+  // Check the functions associated with CaseOp, IfOp and WhileOp.
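+  // These ops reference their branch or body functions by symbol, so a saver
+  // inside a callee must also make the calling op stateful.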
+  for (const NamedAttribute& attr : op->getAttrs()) {
+    FlatSymbolRefAttr symbol_attr =
+        dyn_cast_or_null<FlatSymbolRefAttr>(attr.getValue());
+    if (!symbol_attr) continue;
+
+    func::FuncOp target_func = dyn_cast_or_null<func::FuncOp>(
+        symbol_table.lookup(symbol_attr.getValue()));
+    if (!target_func) continue;
+
+    if (!target_func.getBody()
+             .getOps<TF::CalibrationStatisticsSaverOp>()
+             .empty()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+#define GEN_PASS_DECL_INSERTCALIBRATIONSTATISTICSSAVERPASS
+#define GEN_PASS_DEF_INSERTCALIBRATIONSTATISTICSSAVERPASS
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc"
+
+class InsertCalibrationStatisticsSaverPass
+    : public impl::InsertCalibrationStatisticsSaverPassBase<
+          InsertCalibrationStatisticsSaverPass> {
+ public:
+  using impl::InsertCalibrationStatisticsSaverPassBase<
+      InsertCalibrationStatisticsSaverPass>::
+      InsertCalibrationStatisticsSaverPassBase;
+
+ private:
+  void runOnOperation() override;
+};
+
+void InsertCalibrationStatisticsSaverPass::runOnOperation() {
+  ModuleOp module_op = getOperation();
+  MLIRContext& ctx = getContext();
+
+  std::unordered_set<std::string> aggregator_ops_to_ignore(
+      aggregator_ops_to_ignore_.begin(), aggregator_ops_to_ignore_.end());
+
+  // Insert CalibrationStatisticsSaverOp at the end of each region.
+  for (auto func_op : module_op.getOps<func::FuncOp>()) {
+    int32_t output_file_idx = 0;
+    StringRef func_name = func_op.getSymName();
+
+    func_op.walk([&output_file_idx, &ctx, &func_name, &aggregator_ops_to_ignore,
+                  this](Operation* op) {
+      for (Region& region : op->getRegions()) {
+        if (succeeded(InsertCalibrationStatisticsSaverOp(
+                region, ctx,
+                GetOutputFilePath(calibration_data_dir_, func_name,
+                                  output_file_idx),
+                aggregator_ops_to_ignore))) {
+          ++output_file_idx;
+        }
+      }
+    });
+  }
+
+  // Control flow ops that contain CalibrationStatisticsSaver ops must be set
+  // to stateful; otherwise the op will not be executed.
+  OpBuilder builder(&ctx);
+  module_op.walk([&builder](Operation* op) {
+    if (op->hasAttrOfType<BoolAttr>("is_stateless") &&
+        ContainCalibrationStatisticsSaverOp(op)) {
+      op->setAttr("is_stateless", builder.getBoolAttr(false));
+    }
+  });
+}
+
+std::unique_ptr<OperationPass<ModuleOp>>
+CreateInsertCalibrationStatisticsSaverPass(
+    StringRef calibration_data_dir,
+    const std::vector<std::string>& aggregator_ops_to_ignore) {
+  InsertCalibrationStatisticsSaverPassOptions options = {
+      .aggregator_ops_to_ignore_ = aggregator_ops_to_ignore,
+      .calibration_data_dir_ = calibration_data_dir.str(),
+  };
+  return std::make_unique<InsertCalibrationStatisticsSaverPass>(options);
+}
+
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc
index 9fb1e9e985d15e..28396ec71ab07e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc
@@ -13,20 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <cstdint>
 #include <utility>
 
+#include "llvm/ADT/STLExtras.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Dialect/Quant/QuantOps.h"  // from @llvm-project  // IWYU pragma: keep
 #include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
 #include "mlir/Dialect/Shape/IR/Shape.h"  // from @llvm-project  // IWYU pragma: keep
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/Matchers.h"  // from @llvm-project
 #include "mlir/IR/OpDefinition.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
@@ -41,6 +45,7 @@ limitations under the License.
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
+#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h"  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
@@ -52,6 +57,10 @@ namespace mlir::quant::stablehlo {
 
 namespace {
 
+using ::stablehlo::quantization::Method;
+using ::stablehlo::quantization::QuantizedType;
+using ::stablehlo::quantization::WeightOnlyPtq;
+
 // Inserts quantization parameters of weights for weight-only quantization and
 // dynamic range quantization of `stablehlo.convolution` and
 // `stablehlo.dot_general`.
@@ -81,45 +90,58 @@ class InsertWeightParamPattern
     if (op->getNumResults() != 1) {
       return failure();
     }
-    auto type = op->getResult(0).getType().cast<TensorType>();
+    auto type = mlir::cast<TensorType>(op->getResult(0).getType());
     if (!type || !type.getElementType().isF32()) {
       return failure();
     }
-    return success(op->hasOneUse() &&
-                   IsWeightQuantizableFunction(*op->getUses().begin()));
+    return success(
+        op->hasOneUse() &&
+        IsWeightQuantizableFunction(*op->getUses().begin(), type.getRank()));
   }
 
   // Checks if the operand is the second operand of a `tf.XlaCallModule` op for
  // `stablehlo.convolution` or `stablehlo.dot_general` with the
  // fully_quantizable trait.
-  static bool IsWeightQuantizableFunction(OpOperand& operand) {
+  static bool IsWeightQuantizableFunction(OpOperand& operand, int64_t rank) {
     if (operand.getOperandNumber() != 1) {
       return false;
     }
 
     Operation* user = operand.getOwner();
-    if (isa<TF::XlaCallModuleOp>(user)) {
-      auto call_op = cast<TF::XlaCallModuleOp>(user);
-      const StringRef function_name = GetEntryFunctionName(call_op);
-      const bool is_conv_or_dot = function_name.contains("conv") ||
-                                  function_name.contains("dot_general");
-      const bool has_quant_trait = HasQuantizableTrait(call_op);
-      return is_conv_or_dot && has_quant_trait;
+    if (!IsWeightOnlyQuantizableOp(*user)) {
+      return false;
     }
-    return false;
+    Method method = GetQuantizationMethodOrDefault(user);
+    return HasValidWeightOnlyPtqMethod(method.weight_only_ptq(), rank);
   }
 
   void rewrite(Operation* op, PatternRewriter& rewriter) const override {
     Operation* quantizable_op = *op->getUsers().begin();
     DenseFPElementsAttr attr;
-    if (!matchPattern(op->getResult(0), m_Constant(&attr))) {
-      return;
+    matchPattern(op->getResult(0), m_Constant(&attr));
+
+    Method method = GetQuantizationMethodOrDefault(quantizable_op);
+    const WeightOnlyPtq& weight_only_ptq = method.weight_only_ptq();
+
+    Type weight_type;
+    if (IsPerTensor(weight_only_ptq)) {
+      weight_type = dyn_cast<quant::QuantizedType>(
+          quant::GetUniformQuantizedTypeForWeight(
+              attr, /*symmetric=*/true, /*num_bits=*/8, /*is_signed=*/true,
+              /*narrow_range=*/true, /*legacy_float_scale=*/false));
+    } else {
+      int quantization_dimension = GetQuantizationDimension(
+          weight_only_ptq, cast<TF::XlaCallModuleOp>(quantizable_op));
+      weight_type = quant::GetUniformQuantizedPerAxisTypeForWeight(
+          attr, quantization_dimension, /*symmetric=*/true, /*num_bits=*/8,
+          /*is_signed=*/true,
+          /*narrow_range=*/true, /*legacy_float_scale=*/false);
     }
-    auto quant_type =
-        quant::GetUniformQuantizedTypeForWeight(
-            attr, /*symmetric=*/false, /*num_bits=*/8, /*is_signed=*/true,
-            /*narrow_range=*/false, /*legacy_float_scale=*/false)
-            .template dyn_cast<quant::QuantizedType>();
+
+    auto quant_type = dyn_cast<quant::QuantizedType>(weight_type);
     if (!quant_type) {
+      op->emitError(
+          "Failed to get weight quantization parameters for weight-only "
+          "quantization.");
       return;
     }
@@ -134,6 +156,80 @@ class InsertWeightParamPattern
         expressed_type, q);
     quantizable_op->setOperand(1, dq.getResult());
   }
+
+ private:
+  static bool HasValidWeightOnlyPtqMethod(const WeightOnlyPtq& weight_only_ptq,
+                                          int64_t rank) {
+    const auto& input_quantized_types = weight_only_ptq.input_quantized_types();
+    if (IsPerTensor(weight_only_ptq)) {
+      return true;
+    }
+    // `input_quantized_types` should contain the spec for the quantization
+    // type of the second operand, which is the weight.
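+    // A per-channel spec for the weight operand looks roughly like this
+    // (schematic textproto):
+    //   input_quantized_types {key: 1 value {dimension_specs {dimension: 3}}}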
+    const QuantizedType& quantized_type = input_quantized_types.at(1);
+    if (const auto& specs = quantized_type.dimension_specs();
+        specs.has_dimension()) {
+      return specs.dimension() >= 0 && specs.dimension() < rank;
+    }
+    return true;
+  }
+
+  static bool IsPerTensor(const WeightOnlyPtq& weight_only_ptq) {
+    const auto& input_quantized_types = weight_only_ptq.input_quantized_types();
+    if (input_quantized_types.empty()) {
+      return true;
+    }
+    auto weight_type = input_quantized_types.find(1);
+    if (weight_type == input_quantized_types.end()) {
+      return true;
+    }
+    return weight_type->second.has_per_tensor();
+  }
+
+  static int GetQuantizationDimension(const WeightOnlyPtq& weight_only_ptq,
+                                      TF::XlaCallModuleOp op) {
+    const QuantizedType& quantized_type =
+        weight_only_ptq.input_quantized_types().at(1);
+    if (quantized_type.dimension_specs().has_dimension()) {
+      return quantized_type.dimension_specs().dimension();
+    }
+    return GetDefaultQuantizationDimension(op);
+  }
+
+  // Determines the quantization dimension of the weights for the given
+  // `tf.XlaCallModule` op. For convolution, returns the output feature
+  // dimension of the kernel. For dot_general, returns the first
+  // non-contracting, non-batching dimension of the rhs. If no such dimension
+  // exists, returns the last dimension of the rhs.
+  static int64_t GetDefaultQuantizationDimension(TF::XlaCallModuleOp op) {
+    const StringRef function_name = GetEntryFunctionName(op);
+    const auto module_op = op->getParentOfType<ModuleOp>();
+    const SymbolTable symbol_table(module_op);
+    func::FuncOp func = symbol_table.lookup<func::FuncOp>(function_name);
+
+    if (function_name.contains("conv")) {
+      return (*(func.getOps<mlir::stablehlo::ConvolutionOp>().begin()))
+          .getDimensionNumbers()
+          .getKernelOutputFeatureDimension();
+    } else if (function_name.contains("dot_general")) {
+      auto dot = *(func.getOps<mlir::stablehlo::DotGeneralOp>().begin());
+      const ::mlir::stablehlo::DotDimensionNumbersAttr dimension_numbers =
+          dot.getDotDimensionNumbers();
+      ArrayRef<int64_t> rhs_contracting_dims =
+          dimension_numbers.getRhsContractingDimensions();
+      ArrayRef<int64_t> rhs_batching_dims =
+          dimension_numbers.getRhsBatchingDimensions();
+      int64_t rank = dot.getRhs().getType().cast<ShapedType>().getRank();
+      for (int i = 0; i < rank; ++i) {
+        // Return the first non-contracting, non-batching dimension of rhs.
+        if (llvm::find(rhs_contracting_dims, i) ==
+                rhs_contracting_dims.end() &&
+            llvm::find(rhs_batching_dims, i) == rhs_batching_dims.end()) {
+          return i;
+        }
+      }
+    }
+    return op.getOperand(1).getType().cast<ShapedType>().getRank() - 1;
+  }
 };
 
 void InsertWeightParamPass::runOnOperation() {
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc
index 6577666ab90f10..d5487dd5ad8abd 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc
@@ -66,7 +66,7 @@ Attribute DefaultOrNullAttr(OpBuilder& builder, const Attribute& attr) {
 
 // Checks whether the value of a constant equals the given float, regardless
 // of the tensor dimension.
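 // For example, it returns true for both a scalar constant and a splat
 // constant of any shape holding that value.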
 bool FloatValueEquals(const Attribute& attr, const double value) {
-  const auto fp_attr = attr.dyn_cast_or_null<DenseFPElementsAttr>();
+  const auto fp_attr = mlir::dyn_cast_or_null<DenseFPElementsAttr>(attr);
   if (!fp_attr) return false;
 
   if (fp_attr.isSplat()) {
@@ -208,7 +208,9 @@ void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() {
   simple_patterns::populateWithGenerated(patterns);
   fusion_patterns::populateWithGenerated(patterns);
   FrozenRewritePatternSet frozen_patterns(std::move(patterns));
-  for (auto func : module_op.getOps<func::FuncOp>()) {
+
+  // Iterate over the sorted list of functions to keep the order deterministic.
+  for (func::FuncOp func : GetSortedFunctions(module_op)) {
     if (failed(applyPatternsAndFoldGreedily(func, frozen_patterns))) {
       func.emitError()
           << "quant-stablehlo-lift-quantizable-spots-as-functions failed.";
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc
index acfe3cfd6fc6b2..9a0d8fb2a25b2b 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc
@@ -69,8 +69,8 @@ class MergeFusionWithUniformDequantizePattern
     auto func_name = call_op.getCallee();
     if (!func_name.starts_with("quantized_")) return failure();
     if (call_op->getNumResults() != 1) return failure();
-    if (!getElementTypeOrSelf(call_op->getResult(0).getType())
-             .isa<quant::UniformQuantizedType>())
+    if (!mlir::isa<quant::UniformQuantizedType>(
+            getElementTypeOrSelf(call_op->getResult(0).getType())))
       return failure();
 
     // Fetch the callee function.
@@ -89,8 +89,8 @@ class MergeFusionWithUniformDequantizePattern
     // Create a new func.call op with f32 output.
     auto new_call_op = call_op.clone();
     new_call_op->getResult(0).setType(
-        call_op.getResult(0).getType().cast<ShapedType>().clone(
-            rewriter.getF32Type()));
+        mlir::cast<ShapedType>(call_op.getResult(0).getType())
+            .clone(rewriter.getF32Type()));
     rewriter.setInsertionPoint(call_op);
     rewriter.insert(new_call_op);
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
index 521f701598fb0a..ed2da6ed103273 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
@@ -73,7 +73,7 @@ class RewriteNchwConvolutionToNhwc
     // Transpose the input tensor: [b, f, 0, 1] => [b, 0, 1, f]
     Value input = op->getOperand(0);
     const TensorType new_input_tensor_type = GetTransposedTensorType(
-        input.getType().cast<TensorType>(), kNchwToNhwcPermutation);
+        mlir::cast<TensorType>(input.getType()), kNchwToNhwcPermutation);
 
     auto input_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
         op.getLoc(), /*resultType0=*/new_input_tensor_type, /*operand=*/input,
@@ -82,7 +82,7 @@ class RewriteNchwConvolutionToNhwc
     // Transpose the filter tensor: [o, i, 0, 1] => [0, 1, i, o]
     Value filter = op->getOperand(1);
     const TensorType new_filter_tensor_type = GetTransposedTensorType(
-        filter.getType().cast<TensorType>(), kOihwToHwioPermutation);
+        mlir::cast<TensorType>(filter.getType()), kOihwToHwioPermutation);
 
     auto filter_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
         op.getLoc(), /*resultType0=*/new_filter_tensor_type, /*operand=*/filter,
@@ -98,7 +98,8 @@ class RewriteNchwConvolutionToNhwc
         /*outputSpatialDimensions=*/SmallVector<int64_t>{1, 2});
 
     // Determine the shape of the output tensor: [b, f, 0, 1] => [b, 0, 1, f]
-    auto output_tensor_type = op->getResult(0).getType().cast<TensorType>();
+    auto output_tensor_type =
+        mlir::cast<TensorType>(op->getResult(0).getType());
     const TensorType new_conv_output_tensor_type =
         GetTransposedTensorType(output_tensor_type, kNchwToNhwcPermutation);
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h
index 2937eec8d9a2f0..d13c589c2ba890 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 #include <string>
+#include <vector>
 
 #include "absl/status/statusor.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
@@ -44,6 +45,12 @@ std::unique_ptr<OperationPass<ModuleOp>>
 CreateLiftQuantizableSpotsAsFunctionsPass(
     const ::stablehlo::quantization::QuantizationSpecs& quantization_specs);
 
+// Creates a pass that inserts CalibrationStatisticsSaverOp.
+std::unique_ptr<OperationPass<ModuleOp>>
+CreateInsertCalibrationStatisticsSaverPass(
+    StringRef calibration_data_dir,
+    const std::vector<std::string>& aggregator_ops_to_ignore);
+
 // Adds generated pass default constructors or options definitions.
 #define GEN_PASS_DECL
 // Adds generated pass registration functions.
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td
index fdb7fa7941f025..7661e8d562fbe9 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td
@@ -63,10 +63,6 @@ def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-function
     Option<"mlir_dump_file_name_", "mlir-dump-file-name",
            "std::optional<std::string>", /*default=*/"std::nullopt",
            "MLIR dump file name.">,
-    Option<"enable_weight_only_",
-           "enable-weight-only",
-           "bool", /*default=*/"false",
-           "Whether to produce weight-only quantized op for convolution and dot_general op.">,
     Option<"merge_fusion_with_dequantize_",
            "merge-fusion-with-dequantize",
            "bool", /*default=*/"false",
@@ -106,10 +102,6 @@ def QuantizePass : Pass<"stablehlo-quantize", "mlir::ModuleOp"> {
            "enable-per-channel-quantized-weight",
            "bool", /*default=*/"true",
            "Whether to enable per-channel quantized weights.">,
-    Option<"enable_weight_only_",
-           "enable-weight-only",
-           "bool", /*default=*/"false",
-           "Whether to produce weight-only quantized op for convolution and dot_general op.">,
   ];
   let dependentDialects = [
     "mlir::stablehlo::StablehloDialect",
@@ -228,3 +220,20 @@ def RemoveShardingCustomCallPass : Pass<"stablehlo-remove-sharding-custom-call",
   }];
   let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
 }
+
+def InsertCalibrationStatisticsSaverPass : Pass<"stablehlo-insert-calibration-statistics-saver", "ModuleOp"> {
+  let summary = "Inserts `CalibrationStatisticsSaver` op to collect and save calibration statistics.";
+  let description = [{
+    Finds all `CustomAggregator` ops in each function and adds a single
+    `CalibrationStatisticsSaver` op at the end of the function to collect their
+    statistics.
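+    Each saver writes its statistics to a separate file under
+    `calibration-data-dir`, named `<function_name>_<index>.pb`.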
+  }];
+  let options = [
+    ListOption<"aggregator_ops_to_ignore_", "aggregator-ops-to-ignore", "std::string",
+               "Ops to ignore when inserting CalibrationStatisticsSaver.">,
+    Option<"calibration_data_dir_", "calibration-data-dir",
+           "std::string", /*default=*/"",
+           "The directory to save calibration data.">,
+  ];
+  let dependentDialects = ["TF::TensorFlowDialect"];
+}
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc
index a6d041a5b8cb9e..787fca3594f14a 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc
@@ -75,6 +75,7 @@
 using ::mlir::stablehlo::GetDimensionSizeOp;
 using ::mlir::stablehlo::ReshapeOp;
 using ::mlir::stablehlo::UniformQuantizeOp;
 using ::stablehlo::quantization::Method;
+using ::stablehlo::quantization::QuantizedDimension;
 using ::stablehlo::quantization::QuantizedType;
 using ::stablehlo::quantization::StaticRangePtq;
 
@@ -237,7 +238,7 @@ void CreateAndReturnQuantizedBiasPattern(
   if (succeeded(bcast_op)) {
     Value bcast_op_result = (*bcast_op)->getResult(0);
     auto bcast_op_result_type =
-        bcast_op_result.getType().cast<RankedTensorType>();
+        mlir::cast<RankedTensorType>(bcast_op_result.getType());
     const ArrayRef<int64_t> bcast_shape = bcast_op_result_type.getShape();
     const TensorType new_bcast_op_result_type = bcast_op_result_type.cloneWith(
         bcast_shape, accumulation_quantized_element_type);
@@ -245,7 +246,7 @@ void CreateAndReturnQuantizedBiasPattern(
   }
 
   const auto add_op_result_type =
-      add_op_result.getType().cast<RankedTensorType>();
+      mlir::cast<RankedTensorType>(add_op_result.getType());
   const ArrayRef<int64_t> add_op_shape = add_op_result_type.getShape();
   // For quantized bias add case, lhs, rhs, and result have the same types.
   const TensorType new_add_op_result_type = add_op_result_type.cloneWith(
@@ -269,7 +270,8 @@ class EntryFuncBodyQuantizationPattern {
   // Returns `success()` if `entry_func_op`'s body is eligible for rewriting. At
   // this point `entry_func_op`'s signature has not been reset with quantized
   // types.
-  virtual LogicalResult match(func::FuncOp entry_func_op) const = 0;
+  virtual LogicalResult match(func::FuncOp entry_func_op,
+                              const Method& quantization_method) const = 0;
 
   // Rewrites the `entry_func_op`'s body.
   virtual void rewrite(func::FuncOp entry_func_op,
@@ -318,7 +320,7 @@ void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter,
 
   Value gemm_style_op_result = gemm_style_op->getResult(0);
   const auto gemm_style_op_result_type =
-      gemm_style_op_result.getType().cast<ShapedType>();
+      mlir::cast<ShapedType>(gemm_style_op_result.getType());
   const ArrayRef<int64_t> gemm_style_shape =
       gemm_style_op_result_type.getShape();
 
@@ -326,11 +328,12 @@ void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter,
   TensorType new_gemm_style_op_result_type;
 
   const double input_scale =
-      getElementTypeOrSelf(input_type).cast<UniformQuantizedType>().getScale();
+      mlir::cast<UniformQuantizedType>(getElementTypeOrSelf(input_type))
+          .getScale();
 
   if (enable_per_channel_quantized_weight) {
-    ArrayRef<double> filter_scales = getElementTypeOrSelf(filter_type)
-                                         .cast<UniformQuantizedPerAxisType>()
+    ArrayRef<double> filter_scales = mlir::cast<UniformQuantizedPerAxisType>(
+                                         getElementTypeOrSelf(filter_type))
                                          .getScales();
     std::vector<double> result_scales;
    result_scales.reserve(filter_scales.size());
@@ -340,8 +343,8 @@ void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter,
     }
 
     const ArrayRef<int64_t> zero_points =
-        getElementTypeOrSelf(filter_type)
-            .cast<UniformQuantizedPerAxisType>()
+        mlir::cast<UniformQuantizedPerAxisType>(
+            getElementTypeOrSelf(filter_type))
             .getZeroPoints();
 
     // `stablehlo.convolution` assumes the following format:
@@ -351,7 +354,7 @@ void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter,
     // `stablehlo.dot_general` legalizable to `tfl.fully_connected` has a
     // filter rank of 2 with the last dimension as the channel dimension.
     const int64_t quantization_dimension =
-        filter_type.cast<ShapedType>().getShape().size() - 1;
+        mlir::cast<ShapedType>(filter_type).getShape().size() - 1;
     accumulation_quantized_element_type =
         CreateI32F32UniformQuantizedPerAxisType(
             gemm_style_op->getLoc(), *rewriter.getContext(), result_scales,
@@ -360,9 +363,9 @@ void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter,
     new_gemm_style_op_result_type = gemm_style_op_result_type.cloneWith(
         gemm_style_shape, accumulation_quantized_element_type);
   } else {
-    const double filter_scale = getElementTypeOrSelf(filter_type)
-                                    .cast<UniformQuantizedType>()
-                                    .getScale();
+    const double filter_scale =
+        mlir::cast<UniformQuantizedType>(getElementTypeOrSelf(filter_type))
+            .getScale();
     const double result_scale = input_scale * filter_scale;
 
     accumulation_quantized_element_type = CreateI32F32UniformQuantizedType(
@@ -408,19 +411,20 @@ void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter,
 class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern {
  public:
   explicit QuantizeDotGeneralOpPattern(
-      const bool enable_per_channel_quantized_weight,
-      const bool enable_weight_only)
+      const bool enable_per_channel_quantized_weight)
       : enable_per_channel_quantized_weight_(
-            enable_per_channel_quantized_weight),
-        enable_weight_only_(enable_weight_only) {}
+            enable_per_channel_quantized_weight) {}
 
-  LogicalResult match(func::FuncOp entry_func_op) const override {
+  LogicalResult match(func::FuncOp entry_func_op,
+                      const Method& quantization_method) const override {
+    if (!quantization_method.has_static_range_ptq()) {
+      return failure();
+    }
     return MatchGemmStyleOp<DotGeneralOp>(entry_func_op);
   }
 
   void rewrite(func::FuncOp entry_func_op, const Method& quantization_method,
                PatternRewriter& rewriter) const override {
-    if (enable_weight_only_) return;
     DotGeneralOp dot_general_op = *entry_func_op.getOps<DotGeneralOp>().begin();
     const bool should_quantize_per_channel =
         enable_per_channel_quantized_weight_ &&
@@ -433,28 +437,26 @@ class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern {
   [[deprecated(
"Do not rely on this field for per-channel quantization. Use `Method` " "instead.")]] const bool enable_per_channel_quantized_weight_; - // TODO: b/331510853 - Deprecate boolean flag and use `Method` to perform - // weight-only quantization. - const bool enable_weight_only_; }; // Quantizes the entry function's body containing a `ConvolutionOp`. class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { public: explicit QuantizeConvolutionOpPattern( - const bool enable_per_channel_quantized_weight, - const bool enable_weight_only) + const bool enable_per_channel_quantized_weight) : enable_per_channel_quantized_weight_( - enable_per_channel_quantized_weight), - enable_weight_only_(enable_weight_only) {} + enable_per_channel_quantized_weight) {} - LogicalResult match(func::FuncOp entry_func_op) const override { + LogicalResult match(func::FuncOp entry_func_op, + const Method& quantization_method) const override { + if (!quantization_method.has_static_range_ptq()) { + return failure(); + } return MatchGemmStyleOp(entry_func_op); } void rewrite(func::FuncOp entry_func_op, const Method& quantization_method, PatternRewriter& rewriter) const override { - if (enable_weight_only_) return; RewriteGemmStyleOp( entry_func_op, rewriter, enable_per_channel_quantized_weight_ && @@ -463,7 +465,8 @@ class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { // Returns true if the quantization method indicates per-channel quantization // for convolution weights. This method specifically matches a quantization - // dimension of 3 for the input index 1. + // dimension of 3 for the input index 1 or unspecified quantization dimension + // for the input index 1. bool IsWeightPerChannelQuantized(const Method& quantization_method) const { if (quantization_method.has_static_range_ptq()) { const StaticRangePtq& static_range_ptq_spec = @@ -472,7 +475,13 @@ class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { if (static_range_ptq_spec.input_quantized_types().contains(1)) { const QuantizedType& weight_quantized_type = static_range_ptq_spec.input_quantized_types().at(1); - return weight_quantized_type.dimension_specs().dimension() == 3; + if (weight_quantized_type.has_per_tensor()) { + return false; + } + const QuantizedDimension& dimension_specs = + weight_quantized_type.dimension_specs(); + return !dimension_specs.has_dimension() || + dimension_specs.dimension() == 3; } } return false; @@ -482,25 +491,60 @@ class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { [[deprecated( "Do not rely on this field for per-channel quantization. Use `Method` " "instead.")]] const bool enable_per_channel_quantized_weight_; - // TODO: b/331510853 - Deprecate boolean flag and use `Method` to perform - // weight-only quantization. - const bool enable_weight_only_; +}; + +// Quantizes the entry function's body for weight-only quantized op. 
+template <typename GemmStyleOp>
+class QuantizeWeightOnlyOpPattern : public EntryFuncBodyQuantizationPattern {
+ public:
+  explicit QuantizeWeightOnlyOpPattern(
+      const bool enable_per_channel_quantized_weight)
+      : enable_per_channel_quantized_weight_(
+            enable_per_channel_quantized_weight) {}
+
+  LogicalResult match(func::FuncOp entry_func_op,
+                      const Method& quantization_method) const override {
+    if (!quantization_method.has_weight_only_ptq()) {
+      return failure();
+    }
+    return MatchGemmStyleOp<GemmStyleOp>(entry_func_op);
+  }
+
+  void rewrite(func::FuncOp entry_func_op, const Method& quantization_method,
+               PatternRewriter& rewriter) const override {}
+
+ private:
+  [[deprecated(
+      "Do not rely on this field for per-channel quantization. Use `Method` "
+      "instead.")]] const bool enable_per_channel_quantized_weight_;
+};
 
 template <typename SingularOpT>
 class QuantizeSingularOpPattern : public EntryFuncBodyQuantizationPattern {
  public:
   explicit QuantizeSingularOpPattern(
-      const bool enable_per_channel_quantized_weight,
-      const bool enable_weight_only) {}
+      const bool enable_per_channel_quantized_weight) {}
 
-  LogicalResult match(func::FuncOp entry_func_op) const override {
+  LogicalResult match(func::FuncOp entry_func_op,
+                      const Method& quantization_method) const override {
+    if (!quantization_method.has_static_range_ptq()) {
+      return failure();
+    }
     const auto op_iterator_range = entry_func_op.getOps<SingularOpT>();
     if (op_iterator_range.empty()) {
       LLVM_DEBUG(llvm::dbgs() << "Function does not have "
                               << SingularOpT::getOperationName() << " op.\n");
       return failure();
     }
+
+    // The entry function body should have a single block with two ops (the op
+    // to be quantized and the return op).
+    Region& body = entry_func_op.getBody();
+    if (body.getBlocks().size() != 1 ||
+        body.begin()->getOperations().size() != 2) {
+      return failure();
+    }
+
     if (!isa<TensorType>(
             (*op_iterator_range.begin()).getResult().getType())) {
       LLVM_DEBUG(llvm::dbgs() << SingularOpT::getOperationName()
@@ -526,13 +570,13 @@ class QuantizeSingularOpPattern : public EntryFuncBodyQuantizationPattern {
 
     // Get the quantized tensor manipulation op's output type and update.
     const auto singular_op_result_type =
-        singular_op_result.getType().cast<ShapedType>();
+        mlir::cast<ShapedType>(singular_op_result.getType());
     const ArrayRef<int64_t> singular_op_shape =
         singular_op_result_type.getShape();
     const TensorType new_singular_op_result_type =
         singular_op_result_type.cloneWith(
-            singular_op_shape,
-            getElementTypeOrSelf(operand_type).cast<UniformQuantizedType>());
+            singular_op_shape, mlir::cast<UniformQuantizedType>(
+                                   getElementTypeOrSelf(operand_type)));
     singular_op_result.setType(new_singular_op_result_type);
 
     // Create requantization op and return.
@@ -599,9 +643,9 @@ void ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp(
     const EntryFuncBodyQuantizationPattern& body_rewrite_pattern,
     const Method& quantization_method) {
   const ModuleOp module_op = xla_call_module_op->getParentOfType<ModuleOp>();
-  const SymbolTable symbol_table(module_op);
 
-  func::FuncOp entry_func_op = GetEntryFuncOp(xla_call_module_op, symbol_table);
+  func::FuncOp entry_func_op =
+      GetEntryFuncOp(xla_call_module_op, SymbolTable(module_op));
   QuantizeEntryFuncOp(ctx, rewriter, xla_call_module_op, entry_func_op,
                       body_rewrite_pattern, quantization_method);
 
@@ -627,16 +671,13 @@ template <typename FuncBodyRewritePatternT,
 class XlaCallModuleOpToCallOp : public OpRewritePattern<TF::XlaCallModuleOp> {
  public:
   explicit XlaCallModuleOpToCallOp(
-      MLIRContext& ctx, const bool enable_per_channel_quantized_weight,
-      const bool enable_weight_only)
+      MLIRContext& ctx, const bool enable_per_channel_quantized_weight)
       : OpRewritePattern<TF::XlaCallModuleOp>(&ctx),
         enable_per_channel_quantized_weight_(
-            enable_per_channel_quantized_weight),
-        enable_weight_only_(enable_weight_only) {}
+            enable_per_channel_quantized_weight) {}
 
   LogicalResult match(TF::XlaCallModuleOp op) const override {
     ModuleOp module_op = op->getParentOfType<ModuleOp>();
-    SymbolTable symbol_table(module_op);
 
     // Ignore ops without quantization method.
     // Consider adding checks for individual methods.
@@ -646,19 +687,18 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern<TF::XlaCallModuleOp> {
     if (!IsQuantizedXlaCallModuleOp(op)) return failure();
 
     // For weight-only quantization, op should be hybrid quantized.
-    if (enable_weight_only_ && !IsHybridQuantizedOp(op)) {
+    if (HasWeightOnlyPtqMethod(op) && !IsHybridQuantizedOp(op)) {
       return failure();
     }
 
-    func::FuncOp entry_func_op = GetEntryFuncOp(op, symbol_table);
+    func::FuncOp entry_func_op = GetEntryFuncOp(op, SymbolTable(module_op));
     if (!entry_func_op) {
       op->emitError("Failed to find a valid entry function.");
       return failure();
     }
-
-    return FuncBodyRewritePatternT(enable_per_channel_quantized_weight_,
-                                   enable_weight_only_)
-        .match(entry_func_op);
+    Method quantization_method = GetQuantizationMethodOrDefault(op);
+    return FuncBodyRewritePatternT(enable_per_channel_quantized_weight_)
+        .match(entry_func_op, quantization_method);
   }
 
   void rewrite(TF::XlaCallModuleOp xla_call_module_op,
@@ -671,8 +711,7 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern<TF::XlaCallModuleOp> {
 
     ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp(
         *rewriter.getContext(), rewriter, xla_call_module_op,
-        FuncBodyRewritePatternT(enable_per_channel_quantized_weight_,
-                                enable_weight_only_),
+        FuncBodyRewritePatternT(enable_per_channel_quantized_weight_),
        quantization_method);
   }
 
@@ -680,9 +719,6 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern<TF::XlaCallModuleOp> {
   [[deprecated(
       "Do not rely on this field for per-channel quantization. Use `Method` "
       "instead.")]] const bool enable_per_channel_quantized_weight_;
-  // TODO: b/331510853 - Deprecate boolean flag and use `Method` to perform
-  // weight-only quantization.
-  const bool enable_weight_only_;
 };
 
 // Quantizes op with regions such as stablehlo.reduce_window op.
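 // Such ops are quantized in place: operands fed by dequantize casts and
 // results consumed by quantize casts are rewired to use the quantized types
 // directly, and the nested region types are updated to match.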
@@ -733,13 +769,13 @@ class QuantizeOpWithRegionPattern
       inputs.reserve(op_with_region->getNumOperands());
       for (Value operand : op_with_region->getOperands()) {
         const Type operand_type = operand.getType();
-        if (operand_type.isa<NoneType>()) {
+        if (mlir::isa<NoneType>(operand_type)) {
           inputs.push_back(operand);
           continue;
         }
 
         const Type element_type =
-            operand.getType().cast<TensorType>().getElementType();
+            mlir::cast<TensorType>(operand.getType()).getElementType();
         if (auto dq_op = dyn_cast_or_null<quantfork::DequantizeCastOp>(
                 operand.getDefiningOp())) {
           inputs.push_back(dq_op.getOperand());
@@ -759,13 +795,13 @@ class QuantizeOpWithRegionPattern
       output_types.reserve(op_with_region->getNumResults());
       for (const Value result : op_with_region->getResults()) {
         const Type result_type = result.getType();
-        if (result_type.isa<NoneType>()) {
+        if (mlir::isa<NoneType>(result_type)) {
           outputs_replaced.push_back(result);
           output_types.push_back(result_type);
           continue;
         }
         const Type result_element_type =
-            result.getType().cast<TensorType>().getElementType();
+            mlir::cast<TensorType>(result.getType()).getElementType();
         // If the user is the QuantizeOp, it must be the only user.
         if (result.hasOneUse() &&
             isa<quantfork::QuantizeCastOp>(*result.user_begin())) {
@@ -799,7 +835,7 @@ class QuantizeOpWithRegionPattern
 
       const Type operand_type = quantized_op->getOperandTypes()[0];
       const Type element_type =
-          operand_type.cast<TensorType>().getElementType();
+          mlir::cast<TensorType>(operand_type).getElementType();
       for (Region& region : quantized_op->getRegions()) {
         ReplaceTypesInNestedRegion(region, element_type);
       }
@@ -856,7 +892,7 @@ class QuantizeOpWithRegionPattern
   // Replaces element type of the given tensor type while preserving shape of
   // the given type. If the given type is not tensor type, just return itself.
   Type ReplaceElementType(const Type type, const Type element_type) const {
-    if (TensorType tensor_type = type.dyn_cast<TensorType>()) {
+    if (TensorType tensor_type = mlir::dyn_cast<TensorType>(type)) {
       return tensor_type.clone(element_type);
     }
     return type;
@@ -874,23 +910,23 @@ bool IsQuantizedCompositeFunction(func::CallOp call_op) {
   bool has_quantized_types = false;
   for (Value operand : call_op.getOperands()) {
-    if (const TensorType type = operand.getType().dyn_cast<TensorType>()) {
-      if (type.getElementType().isa<FloatType>()) {
+    if (const TensorType type = mlir::dyn_cast<TensorType>(operand.getType())) {
+      if (mlir::isa<FloatType>(type.getElementType())) {
         return false;
       }
-      if (type.getElementType()
-              .isa<UniformQuantizedType, UniformQuantizedPerAxisType>()) {
+      if (mlir::isa<UniformQuantizedType, UniformQuantizedPerAxisType>(
+              type.getElementType())) {
         has_quantized_types = true;
       }
     }
   }
   for (const Value result : call_op.getResults()) {
-    if (const auto type = result.getType().dyn_cast<TensorType>()) {
-      if (type.getElementType().isa<FloatType>()) {
+    if (const auto type = mlir::dyn_cast<TensorType>(result.getType())) {
+      if (mlir::isa<FloatType>(type.getElementType())) {
        return false;
      }
-      if (type.getElementType()
-              .isa<UniformQuantizedType, UniformQuantizedPerAxisType>()) {
+      if (mlir::isa<UniformQuantizedType, UniformQuantizedPerAxisType>(
+              type.getElementType())) {
        has_quantized_types = true;
      }
    }
  }
@@ -919,7 +955,7 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) {
             ->has_same_scale_requirement) {
       for (const OpResult result : preceding_op->getResults()) {
         const Type element_type = getElementTypeOrSelf(result.getType());
-        if (element_type.isa<UniformQuantizedType>()) {
+        if (mlir::isa<UniformQuantizedType>(element_type)) {
           return true;
         }
       }
@@ -947,7 +983,7 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) {
             ->has_same_scale_requirement) {
       for (Value operand : following_op->getOperands()) {
         const Type element_type = getElementTypeOrSelf(operand.getType());
-        if (element_type.isa<UniformQuantizedType>()) {
+        if (mlir::isa<UniformQuantizedType>(element_type)) {
           return true;
         }
       }
@@ -958,20 +994,6 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) {
 
   return false;
 }
 
-template <typename GemmStyleOp>
-class QuantizeWeightOnlyOpPattern : public EntryFuncBodyQuantizationPattern {
- public:
-  explicit QuantizeWeightOnlyOpPattern(
-      const bool enable_per_channel_quantized_weight) {}
-
-  LogicalResult match(func::FuncOp entry_func_op) const override {
-    return MatchGemmStyleOp<GemmStyleOp>(entry_func_op);
-  }
-
-  void rewrite(func::FuncOp entry_func_op, const Method& quantization_method,
-               PatternRewriter& rewriter) const override {}
-};
-
 // Compute heavy patterns should be quantized for both server and ODML targets.
 // Most patterns here are useful when quantized since they are compute heavy
 // or memory bound.
@@ -979,13 +1001,18 @@ void PopulateCommonQuantizationPatterns(
     MLIRContext& ctx, RewritePatternSet& patterns,
     const bool enable_per_channel_quantized_weight) {
   patterns.add<XlaCallModuleOpToCallOp<QuantizeConvolutionOpPattern>>(
-      ctx, enable_per_channel_quantized_weight, /*enable_weight_only=*/false);
+      ctx, enable_per_channel_quantized_weight);
   patterns.add<XlaCallModuleOpToCallOp<QuantizeDotGeneralOpPattern>>(
-      ctx, enable_per_channel_quantized_weight, /*enable_weight_only=*/false);
+      ctx, enable_per_channel_quantized_weight);
+  patterns
+      .add<XlaCallModuleOpToCallOp<QuantizeWeightOnlyOpPattern<ConvolutionOp>>>(
+          ctx, enable_per_channel_quantized_weight);
+  patterns
+      .add<XlaCallModuleOpToCallOp<QuantizeWeightOnlyOpPattern<DotGeneralOp>>>(
+          ctx, enable_per_channel_quantized_weight);
   // TODO: b/307620772 - Per-channel quantization for gather.
   patterns.add<XlaCallModuleOpToCallOp<QuantizeSingularOpPattern<GatherOp>>>(
-      ctx, /*enable_per_channel_quantized_weight=*/false,
-      /*enable_weight_only=*/false);
+      ctx, /*enable_per_channel_quantized_weight=*/false);
   // Populate pattern for quantization of ops with regions such as
   // `stablehlo.reduce_window` op.
   patterns.add<QuantizeOpWithRegionPattern>(ctx);
@@ -994,16 +1021,7 @@ void PopulateCommonQuantizationPatterns(
 
 void PopulateAllQuantizablePatterns(MLIRContext& ctx,
                                     RewritePatternSet& patterns) {
   patterns.add<XlaCallModuleOpToCallOp<QuantizeSingularOpPattern<GatherOp>>>(
-      ctx, /*enable_per_channel_quantized_weight=*/false,
-      /*enable_weight_only=*/false);
-}
-
-void PopulateQuantizeWeightOnlyPatterns(MLIRContext& ctx,
-                                        RewritePatternSet& patterns) {
-  patterns.add<XlaCallModuleOpToCallOp<QuantizeConvolutionOpPattern>,
-               XlaCallModuleOpToCallOp<QuantizeDotGeneralOpPattern>>(
-      ctx, /*enable_per_channel_quantized_weight*/ false,
-      /*enable_weight_only=*/true);
+      ctx, /*enable_per_channel_quantized_weight=*/false);
 }
 
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h
index 67eb267c1d9037..c07314d6cff6cf 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h
@@ -40,6 +40,7 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
+#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -59,18 +60,8 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op);
 // quantization parameters are annotated by the QuantizeOp/DequantizeOp pairs.
 // Each matched pattern is rewritten by its quantized alternative.
 //
-// The concrete pattern, which extends from this base pattern, can specify
-// whether it allows weight-only quantization. If it is allowed, an
-// operand/result that is not adjacent to a dequantize/quantize op remains as
-// float. An
For -// operand/result that is adjacent to dequantize/quantize, it is quantized. -// Weight-only quantization can be used to generate both weight-only -// quantization and dynamic range quantization. The condition for allowing -// weight-only quantization or not for an op can be specified in the below -// function: -// -// static bool AllowWeightOnlyQuantization(Operation& op) -// -// This is a templatized `OpRewritePattern`. +// The quantization method is determined by the `_quantization_method` attribute +// attached to each quantizable unit. // // Template constraints are imposed as follows: // @@ -159,18 +150,22 @@ class StableHloQuantizationPattern : public OpRewritePattern { return failure(); } + const bool weight_only_quantizable = + IsWeightOnlyQuantizableOp(*candidate_op); + // Collect all the quantized inputs and "clone" the matched op by these // inputs. SmallVector inputs; inputs.reserve(candidate_op->getNumOperands()); for (auto operand : candidate_op->getOperands()) { Type operand_type = operand.getType(); - if (operand_type.isa()) { + if (mlir::isa(operand_type)) { inputs.push_back(operand); continue; } - auto ele_type = operand.getType().cast().getElementType(); + auto ele_type = + mlir::cast(operand.getType()).getElementType(); if (auto dq_op = dyn_cast_or_null(operand.getDefiningOp())) { inputs.push_back(dq_op.getOperand()); @@ -178,8 +173,7 @@ class StableHloQuantizationPattern : public OpRewritePattern { // If the operand is an integer tensor, then it doesn't require the // DequantizeOp in the pattern. inputs.push_back(operand); - } else if (static_cast(this) - ->AllowWeightOnlyQuantization(*candidate_op)) { + } else if (weight_only_quantizable) { inputs.push_back(operand); } else { return failure(); @@ -197,13 +191,13 @@ class StableHloQuantizationPattern : public OpRewritePattern { Type result_type = result.getType(); // Add this to the test coverage once we create test ops with none type // results. - if (result_type.isa()) { + if (mlir::isa(result_type)) { outputs_replaced.insert({result, enumerated_result.index()}); output_types.push_back(result_type); continue; } Type result_ele_type = - result.getType().cast().getElementType(); + mlir::cast(result.getType()).getElementType(); // If the user is the QuantizeOp, it must be the only user. if (result.hasOneUse() && isa(*result.user_begin())) { auto user = cast(*result.user_begin()); @@ -215,8 +209,7 @@ class StableHloQuantizationPattern : public OpRewritePattern { // D op in the pattern. outputs_replaced.insert({result, enumerated_result.index()}); output_types.push_back(result.getType()); - } else if (static_cast(this) - ->AllowWeightOnlyQuantization(*candidate_op)) { + } else if (weight_only_quantizable) { outputs_replaced.insert({result, enumerated_result.index()}); output_types.push_back(result.getType()); } else { @@ -260,10 +253,6 @@ void PopulateCommonQuantizationPatterns( MLIRContext& ctx, RewritePatternSet& patterns, const bool enable_per_channel_quantized_weight); void PopulateAllQuantizablePatterns(MLIRContext& ctx, RewritePatternSet& patterns); -// Populates pattern weight-only quantization. 
-void PopulateQuantizeWeightOnlyPatterns(MLIRContext& ctx, - RewritePatternSet& patterns); - } // namespace mlir::quant::stablehlo #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc index 0000057402886f..86dbae8e4181f9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc @@ -77,35 +77,14 @@ struct StableHloQuantizationReverse quantfork::QuantizeCastOp>(ctx) {} }; -bool IsHybridQuantizableOp(Operation& op) { - auto call_op = cast(op); - if (call_op == nullptr) return false; - StringRef entry_function_name = GetEntryFunctionName(call_op); - return entry_function_name.contains("conv") || - entry_function_name.contains("dot_general"); -} - -// Quantization rewrite pattern using DQ as the root op. -struct StableHloQuantizationWeightOnly - : public StableHloQuantizationBase { - explicit StableHloQuantizationWeightOnly(MLIRContext* ctx) - : StableHloQuantizationBase(ctx) {} - - static bool AllowWeightOnlyQuantization(Operation& op) { - return IsHybridQuantizableOp(op); - } -}; - class QuantizePass : public impl::QuantizePassBase { public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizePass) using impl::QuantizePassBase::QuantizePassBase; - explicit QuantizePass(const bool enable_per_channel_quantized_weight, - const bool enable_weight_only) { + explicit QuantizePass(const bool enable_per_channel_quantized_weight) { enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; - enable_weight_only_ = enable_weight_only; } private: @@ -118,10 +97,6 @@ void QuantizePass::runOnOperation() { RewritePatternSet patterns(&ctx); patterns.add(&ctx); - if (enable_weight_only_) { - patterns.add(&ctx); - PopulateQuantizeWeightOnlyPatterns(ctx, patterns); - } PopulateCommonQuantizationPatterns(ctx, patterns, enable_per_channel_quantized_weight_); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc index 1efc5d40c7ce20..a713f5501b271d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include - #include "absl/log/log.h" #include "absl/status/status.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project // IWYU pragma: keep @@ -26,8 +24,6 @@ limitations under the License. 
#include "mlir/Support/TypeID.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" // IWYU pragma: keep -#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" @@ -55,10 +51,8 @@ class QuantizeCompositeFunctionsPass QuantizeCompositeFunctionsPass>::QuantizeCompositeFunctionsPassBase; explicit QuantizeCompositeFunctionsPass( - const bool enable_per_channel_quantized_weight, - const bool enable_weight_only) { + const bool enable_per_channel_quantized_weight) { enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; - enable_weight_only_ = enable_weight_only; } private: @@ -80,9 +74,10 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { // Change this to user-given bit width once we have custom configuration. options.bit_width_ = 8; - if (enable_weight_only_) { - pm.addNestedPass(createInsertWeightParamPass()); - } + // Insert quantization parameters for weights for ops with `weight_only_ptq` + // attribute. + pm.addNestedPass(createInsertWeightParamPass()); + // PrepareQuantizePass uses SymbolTable to fetch relevant GEMM ops for // determining quantization attributes. This requires module-level context. pm.addPass(createPrepareQuantizePass(options)); @@ -90,7 +85,7 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { QuantizePassOptions quantize_options; quantize_options.enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight_; - quantize_options.enable_weight_only_ = enable_weight_only_; + // QuantizePass modifies FuncOps referenced outside of its given scope // and therefore requires a module-level context. pm.addPass(createQuantizePass(quantize_options)); @@ -113,10 +108,6 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { !pm_run_status.ok()) { signalPassFailure(); } - - // Emit human-readable quantization report. - const QuantizationReport report(module_op); - report.Print(); } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc index 95f150d683c57b..e0469cc8d14032 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc @@ -111,7 +111,7 @@ class QuantizeWeight : public OpRewritePattern { QuantizationUnits GetQuantizableOps(ConstantOp op) const { // Non-float tensors do not need quantization. 
QuantizationUnits quantizable_ops; - const ShapedType type = op.getType().dyn_cast(); + const ShapedType type = mlir::dyn_cast(op.getType()); if (!type || !type.getElementType().isF32()) return quantizable_ops; const Value value = op.getResult(); @@ -150,7 +150,7 @@ class QuantizeWeight : public OpRewritePattern { } TensorType old_result_type = - op.getResult().getType().dyn_cast(); + mlir::dyn_cast(op.getResult().getType()); const FloatType quantized_type = FloatType::getF16(op.getContext()); const ShapedType new_result_type = old_result_type.clone(quantized_type); @@ -184,7 +184,7 @@ class QuantizeWeight : public OpRewritePattern { // Get types. const Type old_result_type = op.getResult().getType(); const ShapedType new_result_type = - convert_op.getType().dyn_cast(); + mlir::dyn_cast(convert_op.getType()); // Proceeds only if the converting is to float16. if (!new_result_type.getElementType().isF16()) continue; @@ -192,7 +192,7 @@ class QuantizeWeight : public OpRewritePattern { // Convert values. std::vector new_values; const DenseFPElementsAttr value_attr = - op.getValue().cast(); + mlir::cast(op.getValue()); new_values.reserve(value_attr.getNumElements()); for (const float value : value_attr.getValues()) { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc index 6ed82c125b0be9..e1b4adb013684c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc @@ -163,7 +163,7 @@ void CreateXlaCallModuleOp(ValueRange inputs, ValueRange outputs, SmallVector shape_attrs; for (const Type result_type : result_types) { shape_attrs.push_back( - tf_type::ShapeAttr::get(ctx, result_type.cast())); + tf_type::ShapeAttr::get(ctx, mlir::cast(result_type))); } const auto empty_array_attr = ArrayAttr::get(ctx, {}); // TODO: b/310291615 - find a better way for platform support. 
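A note on the recurring cast rewrites in the C++ hunks above: they migrate MLIR's deprecated member-style casts (`x.isa<T>()`, `x.cast<T>()`, `x.dyn_cast<T>()`) to the free functions re-exported through `mlir/Support/LLVM.h`. A minimal sketch of the new spelling follows; the helper name is illustrative and not part of this patch:

#include "mlir/IR/BuiltinTypes.h"  // mlir::TensorType
#include "mlir/IR/Types.h"         // mlir::Type
#include "mlir/Support/LLVM.h"     // re-exports llvm::isa/cast/dyn_cast into mlir::

// Illustrative helper (not in this patch): unwrap a tensor's element type,
// or return the type unchanged when it is not shaped.
static mlir::Type ElementTypeOrSelfSketch(mlir::Type type) {
  // New spelling: a free function taking the casted value as an argument.
  if (auto tensor_type = mlir::dyn_cast<mlir::TensorType>(type))
    return tensor_type.getElementType();
  // Deprecated spelling being removed upstream, for comparison:
  //   if (auto tensor_type = type.dyn_cast<mlir::TensorType>()) ...
  return type;
}

The free functions also accept several candidate types in one call (`mlir::isa<A, B>(v)`), which the quantized-element-type checks in `IsQuantizedCompositeFunction` appear to rely on.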
@@ -502,7 +502,7 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass:: SymbolTable symbol_table(module_op); for (auto call_op : main_func.getOps()) { func_ops.push_back(dyn_cast_or_null(symbol_table.lookup( - call_op.getFAttr().cast().getValue()))); + mlir::cast(call_op.getFAttr()).getValue()))); } for (auto call_op : main_func.getOps()) { func_ops.push_back( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD index df5252b986adf5..0999d37da524c2 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD @@ -30,6 +30,7 @@ package( pytype_strict_library( name = "quantization", srcs = ["quantization.py"], + visibility = ["//visibility:public"], deps = [ ":pywrap_quantization", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_py", @@ -45,6 +46,10 @@ pytype_strict_library( # testonly = 1, # srcs = ["integration_test/quantize_model_test_base.py"], # tags = ["no_pip"], +# visibility = [ +# "//learning/brain/mlir/quantization/stablehlo:__subpackages__", +# "//tensorflow/compiler/mlir/quantization:__subpackages__", +# ], # deps = [ # "//third_party/py/mlir:ir", # "//third_party/py/mlir:stablehlo_dialect", @@ -62,6 +67,7 @@ pytype_strict_library( # "//tensorflow/python/ops:nn_ops", # "//tensorflow/python/ops:variables", # "//tensorflow/python/platform:client_testlib", +# "//tensorflow/python/platform:tf_logging", # "//tensorflow/python/saved_model:load", # "//tensorflow/python/saved_model:loader", # "//tensorflow/python/saved_model:save", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py index f65c56bc577742..ab0fb1d5662bba 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +import os import re from typing import Mapping, Optional, Sequence from absl.testing import parameterized import numpy as np +from google.protobuf import text_format from tensorflow.compiler.mlir.quantization.common.python import testing from tensorflow.compiler.mlir.quantization.stablehlo import quantization_config_pb2 as qc from tensorflow.compiler.mlir.quantization.stablehlo.python import quantization @@ -145,7 +147,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: # done in MLIR level. # Tests that the quantized graph outputs similar values. The rtol and atol # values are arbitrary. - self.assertAllClose(new_outputs, expected_outputs, rtol=0.03, atol=0.2) + self.assertAllClose(new_outputs, expected_outputs, rtol=0.3, atol=0.2) # Due to other meta data, the compression is not exactly 1/4. 
self.assertLess( @@ -575,6 +577,114 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 0.65, ) + def test_reuse_calibration_data(self): + _, y_shape, bias_shape, x_signature, y_signature = ( + self._prepare_sample_einsum_datashapes('abc,cde->abde', use_bias=True) + ) + + self._create_einsum_model( + self._input_saved_model_path, + 'abc,cde->abde', + y_shape, + x_signature, + y_signature, + bias_shape, + ) + + # Generate model input data. + rng = np.random.default_rng(seed=42) + input_data = ops.convert_to_tensor( + rng.uniform(low=0.0, high=1.0, size=x_signature).astype('f4') + ) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(100): + yield { + 'x': ops.convert_to_tensor( + np.random.uniform(low=0.0, high=1.0, size=x_signature).astype( + 'f4' + ) + ), + } + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + calibration_data_dir = self.create_tempdir('calibration_data').full_path + config = qc.QuantizationConfig( + static_range_ptq_preset=qc.StaticRangePtqPreset( + representative_datasets=[ + qc.RepresentativeDatasetConfig( + tf_record=qc.TfRecordFile(path=dataset_path) + ) + ] + ), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + calibration_options=qc.CalibrationOptions( + calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, + calibration_data_dir=calibration_data_dir, + ), + ) + + # Run quantization the first time; calibration is expected to run. + with self.assertLogs(level='INFO') as info_logs: + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + self.assertTrue( + self._any_log_contains( + 'Calibration step is executed in graph mode.', + info_logs.records, + ) + ) + module_str = self._extract_first_xla_call_module_op( + self._output_saved_model_path + ) + self.assertTrue( + re.search('stablehlo.dot_general.*xi8>.*xi8>.*xi32>', module_str) + ) + + # Run quantization the second time; calibration is expected to be skipped. + output_saved_model_path_2 = self.create_tempdir('output2').full_path + with self.assertLogs(level='INFO') as info_logs: + quantization.quantize_saved_model( + self._input_saved_model_path, + output_saved_model_path_2, + config, + ) + self.assertFalse( + self._any_log_contains( + 'Calibration step is executed in graph mode.', + info_logs.records, + ) + ) + module_str = self._extract_first_xla_call_module_op( + output_saved_model_path_2 + ) + self.assertTrue( + re.search('stablehlo.dot_general.*xi8>.*xi8>.*xi32>', module_str) + ) + + # Expect both quantized models to produce the same results. + root = load.load(self._output_saved_model_path) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + new_outputs_1 = root.signatures['serving_default']( + x=ops.convert_to_tensor(input_data) + ) + + root = load.load(output_saved_model_path_2) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + new_outputs_2 = root.signatures['serving_default']( + x=ops.convert_to_tensor(input_data) + ) + + self.assertAllClose(new_outputs_1, new_outputs_2) + @parameterized.named_parameters( ('use_constant_with_int32_input', np.int32, False), ('use_variable_with_int32_input', np.int32, True), @@ -897,7 +1007,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: # be exactly the same.
Indirectly proves that the `FunctionNameMatcherSpec` # with regex '.*invalid_function_name.*' did not match the quantizable unit. self.assertAllClose(new_outputs, expected_outputs, rtol=0.04) - self.assertNotAllClose(new_outputs, expected_outputs, rtol=0.00001) + self.assertNotAllClose(new_outputs, expected_outputs, rtol=1e-7) # Due to other meta data, the compression is not exactly 1/4. self.assertLess( @@ -907,6 +1017,72 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 0.4, ) + def test_save_quantization_report_file(self): + """Tests that the quantization report file is created. + + Also tests that it is populated with a textproto of `QuantizationResults`. + """ + input_shape = (1, 16) + filter_shape = (16, 3) + self._create_matmul_model( + input_shape, + filter_shape, + self._input_saved_model_path, + ) + + rng = np.random.default_rng(seed=42) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(100): + yield { + 'input_tensor': rng.uniform( + low=0.0, high=1.0, size=input_shape + ).astype(np.float32) + } + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + report_file_path = self.create_tempfile('report.txtpb').full_path + config = qc.QuantizationConfig( + static_range_ptq_preset=qc.StaticRangePtqPreset( + representative_datasets=[ + qc.RepresentativeDatasetConfig( + tf_record=qc.TfRecordFile(path=dataset_path) + ) + ] + ), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + report_file_path=report_file_path, + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + # Test the contents of the report file, which is a textproto of + # `QuantizationResults`.
+ self.assertTrue(os.path.exists(report_file_path)) + with open(report_file_path, 'r') as f: + quantization_results_textpb = f.read() + + results = qc.QuantizationResults() + text_format.Parse(quantization_results_textpb, results) + + self.assertProtoEquals( + expected_message_maybe_ascii=r""" + results { + quantizable_unit { name: "composite_dot_general_fn_1" } + method { static_range_ptq {} } + } + """, + message=results, + ) + @test_util.run_all_in_graph_and_eager_modes class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): @@ -931,7 +1107,7 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): 'calibration_options': qc.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, calibration_parameters=qc.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + num_bins=10, ), ), }, @@ -939,7 +1115,7 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): 'calibration_options': qc.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, calibration_parameters=qc.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + num_bins=10, ), ), }, @@ -947,7 +1123,7 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): 'calibration_options': qc.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, calibration_parameters=qc.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + num_bins=10, ), ), }, @@ -955,7 +1131,7 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): 'calibration_options': qc.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, calibration_parameters=qc.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + num_bins=10, ), ), }, @@ -1212,9 +1388,8 @@ def test_conv_weight_only_model( self._output_saved_model_path ) - # Tests that the output graph contains subtract and multiply for + # Tests that the output graph contains a multiply op for symmetric # dequantization. - self.assertTrue(re.search('stablehlo.subtract', module_str)) self.assertTrue(re.search('stablehlo.multiply', module_str)) # Tests that the output graph contains float dot_general. self.assertTrue( @@ -1357,6 +1532,58 @@ def data_gen() -> repr_dataset.RepresentativeDataset: # Check add is not quantized. self.assertTrue(re.search(r'stablehlo.add.*f32>', module_str), module_str) + def test_save_quantization_report_file(self): + """Tests that the quantization report file is created. + + Also tests that it is populated with a textproto of `QuantizationResults`. + """ + + input_shape = (1, 3, 4, 3) + filter_shape = (2, 3, 3, 2) + self._create_conv2d_model( + input_shape, + filter_shape, + self._input_saved_model_path, + ) + + report_file_path = self.create_tempfile('report.txtpb').full_path + config = qc.QuantizationConfig( + weight_only_ptq_preset=qc.WeightOnlyPtqPreset(), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + report_file_path=report_file_path, + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + # Test the contents of the report file, which is a textproto of + # `QuantizationResults`.
+ self.assertTrue(os.path.exists(report_file_path)) + with open(report_file_path, 'r') as f: + quantization_results_textpb = f.read() + + results = qc.QuantizationResults() + text_format.Parse(quantization_results_textpb, results) + + self.assertProtoEquals( + expected_message_maybe_ascii=r""" + results { + quantizable_unit { name: "composite_conv_fn_1" } + method { + weight_only_ptq { + input_quantized_types { + key: 1 + value { dimension_specs {} } + } + } + } + } + """, + message=results, + ) + if __name__ == '__main__': test.main() diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py index 31c53a4cf20fe9..fef1784fec9370 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py @@ -33,11 +33,13 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import load from tensorflow.python.saved_model import loader_impl from tensorflow.python.saved_model import save as saved_model_save from tensorflow.python.types import core + FUNC_ALIAS = 'some_alias' @@ -164,6 +166,27 @@ def matmul(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: ) return model + def _any_log_contains( + self, substring: str, log_record_list: List['logging.LogRecord'] + ) -> bool: + """Returns True if any log record contains the given substring. + + Args: + substring: The substring to search for in the log + messages. + log_record_list: A list of `absl.logging.LogRecord`s. + + Returns: + True if and only if the substring exists in any of the logs in + `log_record_list`. + """ + return any( + map( + lambda log_record: substring in str(log_record.message), + log_record_list, + ) + ) + def _create_matmul_and_same_scale_model( self, input_shape: Sequence[int], diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc index 3b5ece120bdeb0..517bd117348072 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" namespace stablehlo::quantization::pywrap { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto index 81f2ff3686fbbe..49e8161df3a749 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto @@ -77,8 +77,8 @@ message StaticRangePtqPreset { bool enable_full_int_quantization = 3; } -// Applies int8 per-tensor weight-only post-training quantization for all -// dot_general op. 
+// Applies int8 per-channel weight-only post-training quantization for all +// dot_general and convolution ops. message WeightOnlyPtqPreset {} // Metadata specific to the input TensorFlow SavedModel, which may be required @@ -131,10 +131,18 @@ message QuantizationResults { repeated QuantizationResult results = 1; } +// Signals per-channel quantization. When the dimension is not specified, the +// StableHLO quantizer chooses the output feature dimension for convolution and +// the first non-batching, non-contracting dimension for dot_general as the +// quantization dimension. message QuantizedDimension { - int32 dimension = 1; // Should be less than the rank of the quantized tensor. + // Should be less than the rank of the quantized tensor. + optional int32 dimension = 1; } +// Signals quantization type to be per-tensor. +message PerTensor {} + // Corresponds to StableHLO's `QuantizedTensorElementType`. Type parameters such // as `QuantizationParameters` are omitted because they are determined during // quantization. // Currently only supports specifying quantization granularity (e.g. for // per-channel quantization). // TODO: b/331144430 - Support specifying storage types. +// Next ID: 3 message QuantizedType { // Specifies the granularity of quantization parameters for each dimension of // a quantized tensor. If specified, per-channel quantization is applied. If // not specified, per-tensor quantization is applied. // TODO: Make it a `repeated` field to be able to express multi-channel / // sub-channel quantization. - QuantizedDimension dimension_specs = 1; + oneof type { + QuantizedDimension dimension_specs = 1; + PerTensor per_tensor = 2; + } } // A quantization method representing "do not quantize". Mostly used for @@ -266,7 +278,7 @@ message DebuggerConfig { } // Defines various calibration options. -// Next ID: 4 +// Next ID: 6 message CalibrationOptions { // Configurations for calibration methods. // Next ID: 7 @@ -296,10 +308,8 @@ message CalibrationOptions { // Parameters required for calibration. // Next ID: 4 message CalibrationParameters { - // The number of bins when histogram is initialized. It can be increased - // because histogram is dynamically expanded by sample inputs. - // initial_num_bins is 256 by default. - int32 initial_num_bins = 1; + // The number of histogram bins. Defaults to 512. + int32 num_bins = 1; // min_percentile is only used in HISTOGRAM_PERCENTILE. // min_percentile is 0.001 by default. float min_percentile = 2; @@ -321,11 +331,19 @@ message CalibrationOptions { // Configures representative dataset. Each item corresponds to a // representative dataset used to calibrate a function. repeated RepresentativeDatasetConfig representative_datasets = 3; + + // The path to save calibration statistics data. If not set, a temporary + // directory is used. + string calibration_data_dir = 4; + + // Whether to regenerate the calibration data even when it already exists in + // `calibration_data_dir`. Defaults to False, i.e. existing calibration data + // is reused. + bool force_regenerate_calibration_data = 5; } // Quantization configuration for StableHLO Quantizer. This is the primary // message containing all configurable options. -// Next ID: 8 +// Next ID: 9 message QuantizationConfig { // Config presets provide predefined popular or common quantization specs. // Lightweight users may choose one of the presets for quick experiments. Each @@ -354,4 +372,9 @@ // activation of static range quantization (SRQ).
Quantization calibration // method is set to MIN_MAX by default. CalibrationOptions calibration_options = 6; + + // Path to file to save the quantization report, which is essentially a + // textproto rendering of `QuantizationResults`. If not set, the report will + // only be emitted to stdout. + optional string report_file_path = 8; } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir index 2f149281fbd0be..61f4b27e66af90 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir @@ -8,9 +8,9 @@ // int ops. func.func @main(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> - %1:4 = "tf.CustomAggregator"(%arg0) <{id = "1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 0.999992311 : f32, max_percentile = 0.000000e+00 : f32, min = 7.547870e-07 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) <{id = "1", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> {min = 7.547870e-07 : f32, max = 0.999992311 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) %2 = "tf.XlaCallModule"(%1#0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> - %3:4 = "tf.CustomAggregator"(%2) <{id = "2"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 5.3033524 : f32, max_percentile = 0.000000e+00 : f32, min = -3.5216827 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + %3:4 = "tf.CustomAggregator"(%2) <{id = "2", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> {min = -17.5216827 : f32, max = 18.3033524 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) return %3#0 : tensor<1x3xf32> } func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { @@ -36,9 +36,9 @@ func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: func.func @main_no_unpack(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> - %1:4 = "tf.CustomAggregator"(%arg0) <{id = "1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 0.999992311 : f32, max_percentile = 0.000000e+00 : f32, min = 7.547870e-07 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) <{id = "1", calibration_method = 1 : 
i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> {device = "", max = 0.999992311 : f32, min = 7.547870e-07 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>) %2 = "tf.XlaCallModule"(%1#0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> - %3:4 = "tf.CustomAggregator"(%2) <{id = "2"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 5.3033524 : f32, max_percentile = 0.000000e+00 : f32, min = -3.5216827 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) + %3:4 = "tf.CustomAggregator"(%2) <{id = "2", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> {device = "", max = 18.3033524 : f32, min = -17.5216827 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>) return %3#0 : tensor<1x3xf32> } func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { @@ -47,7 +47,7 @@ func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: } // CHECK-NO-UNPACK-LABEL: func.func @main_no_unpack // CHECK-NO-UNPACK-SAME: (%[[ARG_0:.+]]: tensor<1x1024xf32>) -> tensor<1x3xf32> -// CHECK-NO-UNPACK-DAG: %[[CONST:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32:1, {{.*}}>> +// CHECK-NO-UNPACK-DAG: %[[CONST:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1024x3xi8>}> : () -> tensor<1024x3x!quant.uniform:f32:1, {{.*}}>> // CHECK-NO-UNPACK: %[[QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x1024xf32>) -> tensor<1x1024x!quant.uniform> // CHECK-NO-UNPACK: %[[DOT:.+]] = stablehlo.dot_general %[[QUANTIZE_0]], %[[CONST]] // CHECK-NO-UNPACK: %[[QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[DOT]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir index 954323af9ef7ad..0c5e7a7cab09f2 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir @@ -8,10 +8,10 @@ func.func @main(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { } // CHECK: @main(%[[ARG_0:.+]]: tensor<1x4xf32>) -> tensor<1x3xf32> // CHECK-DAG: %[[CST:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> -// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG_0]]) <{calibration_method = 1 : i32, id = "composite_dot_general_fn_1_arg_0_calibration_method_1", max_percentile = 0.000000e+00 : f32, 
min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) // CHECK: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[CST]]) // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" -// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{calibration_method = 1 : i32, id = "composite_dot_general_fn_1_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] : tensor<1x3xf32> // CHECK: } // CHECK: } @@ -28,10 +28,10 @@ func.func @serving_default(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> { } // CHECK: @serving_default(%[[ARG_0:.+]]: tensor<1x4xf32>) -> tensor<1x3xf32> // CHECK-DAG: %[[CST:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32> -// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG_0]]) <{calibration_method = 1 : i32, id = "composite_dot_general_fn_1_arg_0_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) // CHECK: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[CST]]) // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" -// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{calibration_method = 1 : i32, id = "composite_dot_general_fn_1_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) // CHECK: return %[[CUSTOM_AGGREGATOR_1]] : tensor<1x3xf32> // CHECK: } // CHECK: } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/defer_activation_transpose.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/defer_activation_transpose.mlir index 96b270f8b888f9..d9db49de957ac9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/defer_activation_transpose.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/defer_activation_transpose.mlir @@ -118,14 +118,14 @@ func.func @reduce_window_max_activation_transpose(%arg0: tensor<1x16x16x4xf32>) // Check that the body is not modified. 
// CHECK: %[[REDUCE_WINDOW:.+]] = "stablehlo.reduce_window"(%[[ARG]], %[[INIT_VALUE_CONST]]) +// CHECK: <{window_dimensions = array, window_strides = array}> // CHECK: ^bb0(%[[REDUCE_ARG_0:.+]]: tensor, %[[REDUCE_ARG_1:.+]]: tensor): // CHECK: %[[MAX:.+]] = stablehlo.maximum %[[REDUCE_ARG_0]], %[[REDUCE_ARG_1]] // CHECK: stablehlo.return %[[MAX]] // Check that the attributes window_dimensions & window_strides are also // permutated to match the new input shape. -// CHECK: {window_dimensions = array, window_strides = array} -// CHECK-SAME: (tensor<1x16x16x4xf32>, tensor) -> tensor<1x8x8x4xf32> +// CHECK: (tensor<1x16x16x4xf32>, tensor) -> tensor<1x8x8x4xf32> // Check that a `stablehlo.transpose` is added to the result to match the shape // of the users. @@ -162,6 +162,7 @@ func.func @reduce_window_max_activation_transpose_explicit_optional_attrs( // Check that the body is not modified. // CHECK: %[[REDUCE_WINDOW:.+]] = "stablehlo.reduce_window"(%[[ARG]], %[[INIT_VALUE_CONST]]) +// CHECK: <{base_dilations = array, window_dilations = array, window_dimensions = array, window_strides = array}> // CHECK: ^bb0(%[[REDUCE_ARG_0:.+]]: tensor, %[[REDUCE_ARG_1:.+]]: tensor): // CHECK: %[[MAX:.+]] = stablehlo.maximum %[[REDUCE_ARG_0]], %[[REDUCE_ARG_1]] // CHECK: stablehlo.return %[[MAX]] @@ -169,8 +170,7 @@ func.func @reduce_window_max_activation_transpose_explicit_optional_attrs( // Check that the attributes window_dimensions & window_strides along with // optional attributes base_dilations and window_dilations are also permutated // to match the new input shape. -// CHECK: {base_dilations = array, window_dilations = array, window_dimensions = array, window_strides = array} -// CHECK-SAME: (tensor<1x16x16x4xf32>, tensor) -> tensor<1x15x15x4xf32> +// CHECK: (tensor<1x16x16x4xf32>, tensor) -> tensor<1x15x15x4xf32> // Check that a `stablehlo.transpose` is added to the result to match the shape // of the users. 
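The new test file below covers the `-stablehlo-insert-calibration-statistics-saver` pass: within each function body holding `tf.CustomAggregator` ops, the min/max/histogram results are routed into a single `tf.CalibrationStatisticsSaver` whose `output_file_path` is derived from the enclosing function name (`serving_default_0.pb`, `main_0.pb`, ...), and `tf.If`/`tf.IfRegion` ops are flipped to stateful so the savers inside their branches are not pruned. What follows is a rough conceptual sketch of the grouping step only, assuming one saver per function body; the result indices and builder details are assumptions for illustration, not the pass implementation (which also handles nested regions):

#include <string>

#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/OperationSupport.h"

// Conceptual sketch: collect every aggregator's statistics results and feed
// them to one saver op inserted before the function terminator.
static void InsertSaverSketch(mlir::func::FuncOp func, mlir::OpBuilder& builder,
                              int file_index) {
  llvm::SmallVector<mlir::Value> stats;
  llvm::SmallVector<mlir::Attribute> ids, methods;
  func.walk([&](mlir::Operation* op) {
    if (op->getName().getStringRef() != "tf.CustomAggregator") return;
    // Results 1, 2 and 3 carry the collected min, max and histogram.
    stats.append({op->getResult(1), op->getResult(2), op->getResult(3)});
    ids.push_back(op->getAttr("id"));
    methods.push_back(op->getAttr("calibration_method"));
  });
  if (stats.empty()) return;  // e.g. composite function bodies get no saver.

  builder.setInsertionPoint(func.getBody().front().getTerminator());
  mlir::OperationState state(func.getLoc(), "tf.CalibrationStatisticsSaver");
  state.addOperands(stats);
  state.addAttribute("ids", builder.getArrayAttr(ids));
  state.addAttribute("calibration_methods", builder.getArrayAttr(methods));
  state.addAttribute("output_file_path",
                     builder.getStringAttr(func.getSymName().str() + "_" +
                                           std::to_string(file_index) + ".pb"));
  builder.create(state);
}

These per-function statistics files are presumably what allows the `test_reuse_calibration_data` test above to skip the calibration step on a second run once `calibration_data_dir` is set.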
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_calibration_statistics_saver.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_calibration_statistics_saver.mlir new file mode 100644 index 00000000000000..f80ce73ff88bf1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_calibration_statistics_saver.mlir @@ -0,0 +1,219 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -mlir-disable-threading -stablehlo-insert-calibration-statistics-saver | FileCheck %s + +func.func @serving_default(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x2x2x2xf32>) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}} { + %cst = "tf.Const"() <{value = dense<[[[[-0.891899645, 0.392044574], [0.77720493, 1.31188095], [0.255048186, 2.700150e+00]], [[-1.08111858, -0.406604826], [-0.298575521, -2.25356531], [-1.00201964, 2.54532099]], [[-1.34911358, 0.279911458], [-0.868258893, -1.36708188], [0.866317451, -2.05804896]]], [[[-0.591397941, 0.331505477], [0.715151429, 2.64073896], [1.27163255, 0.206143498]], [[0.474211812, 1.45044816], [0.119936548, 2.54149938], [-0.939900994, 0.438387245]], [[-1.12486279, -1.09022558], [0.82202208, 1.04652023], [1.30316162, 2.62054276]]]]> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 5 : i32, id = "0", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) + %0 = "tf.Conv2D"(%output, %cst) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + %output_1, %min_2, %max_3, %histogram_4 = "tf.CustomAggregator"(%0) <{calibration_method = 5 : i32, id = "1", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x2x2x2xf32>) -> (tensor<1x2x2x2xf32>, tensor, tensor, tensor<512xi64>) + %1 = "tf.Identity"(%output_1) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %1 : tensor<1x2x2x2xf32> +} +// CHECK-LABEL: @serving_default +// CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" +// CHECK-SAME: <{calibration_method = 5 : i32, id = "0", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" +// CHECK-SAME: <{calibration_method = 5 : i32, id = "1", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x2x2x2xf32>) -> (tensor<1x2x2x2xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_O]], %[[MAX_O]], %[[HISTOGRAM_0]], %[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) +// CHECK-SAME: <{calibration_methods = [5 : i32, 5 : i32], ids = ["0", "1"], output_file_path = "serving_default_0.pb"}> : (tensor, tensor, tensor<512xi64>, tensor, tensor, tensor<512xi64>) -> () +// CHECK: return + +// ----- + +// No CustomAggregator ops exist.
+func.func private @composite_conv2d_with_bias_and_relu6_fn_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x2x2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.Conv2D"(%arg0, %arg1) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> : (tensor<1x2x2x2xf32>, tensor<2xf32>) -> tensor<1x2x2x2xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %2 : tensor<1x2x2x2xf32> +} +// CHECK-LABEL: @composite_conv2d_with_bias_and_relu6_fn_1 +// CHECK-NOT: "tf.CalibrationStatisticsSaver" + +// ----- + +// Check the IfOp is set to stateful. +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: func.func @serving_default + // CHECK: "tf.If" + // CHECK-SAME: is_stateless = false + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_0 = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %0 = "tf.Sum"(%arg0, %cst) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.If"(%1, %arg0) <{else_branch = @cond_false_80, is_stateless = true, then_branch = @cond_true_70}> {Tcond = i1, Tin = [f32], Tout = [i1, f32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor, tensor<1x4xf32>) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @cond_false_80 + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "cond_false_80_0.pb" + func.func private @cond_false_80(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_false_8"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = 
"tf.PartitionedCall"(%output, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_2, %min_3, %max_4, %histogram_5 = "tf.CustomAggregator"(%1) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %2 = "tf.Identity"(%output_2) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + + // CHECK-LABEL: func.func private @cond_true_70 + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "cond_true_70_0.pb" + func.func private @cond_true_70(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_true_7"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%output, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_2, %min_3, %max_4, %histogram_5 = "tf.CustomAggregator"(%1) <{calibration_method = 1 : i32, id = "3", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %2 = "tf.Identity"(%output_2) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = 
"tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +// Check the IfRegion is set to stateful. +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: func.func @serving_default + // CHECK: "tf.IfRegion" + // CHECK-SAME: is_stateless = false + + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "serving_default_0.pb" + + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "serving_default_1.pb" + + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "serving_default_2.pb" + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_2 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_3 = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_4 = "tf.Const"() <{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_5 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.Sum"(%output, %cst_0) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.IfRegion"(%1) <{_else_func_name = "cond_false_80", _then_func_name = "cond_true_70", is_stateless = true}> ({ + %4 = "tf.Identity"(%cst_3) {device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%output, %cst_1, %cst_2) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_6, %min_7, %max_8, %histogram_9 = "tf.CustomAggregator"(%5) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %6 = "tf.Identity"(%output_6) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }, { + %4 = "tf.Identity"(%cst_3) 
{device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%output, %cst_4, %cst_5) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %output_6, %min_7, %max_8, %histogram_9 = "tf.CustomAggregator"(%5) <{calibration_method = 1 : i32, id = "2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + %6 = "tf.Identity"(%output_6) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + func.func @main(%arg0: tensor<10x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<10x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<0.000000e+00>: tensor<10x1024x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x1024xf32>) -> (tensor<10x1x1024xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.XlaCallModule"(%output, %cst) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %output_0, %min_1, %max_2, 
%histogram_3 = "tf.CustomAggregator"(%0) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x3xf32>) -> (tensor<10x1x3xf32>, tensor, tensor, tensor<0xi64>) + return %output_0 : tensor<10x1x3xf32> + } + // CHECK-LABEL: @main + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" + // CKECK-SAME: <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" + // CKECK-SAME: <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_O]], %[[MAX_O]], %[[HISTOGRAM_0]], %[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) + // CHECK-SAME: <{calibration_methods = [1 : i32, 1 : i32], ids = ["0", "1"], output_file_path = "main_0.pb"}> : (tensor, tensor, tensor<0xi64>, tensor, tensor, tensor<0xi64>) -> () + // CHECK: return + + func.func private @composite_dot_general_with_relu_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %cst = stablehlo.constant dense<0.000000e+00> : tensor<10x1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1], precision = [DEFAULT, DEFAULT] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %1 = stablehlo.maximum %0, %cst : tensor<10x1x3xf32> + return %1 : tensor<10x1x3xf32> + } + // CHECK-LABEL: func.func private @composite_dot_general_with_relu_fn_1 + // CHECK-NOT: "tf.CalibrationStatisticsSaver" +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + // CHECK-LABEL: func.func @main + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "main_0.pb" + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "main_1.pb" + // CHECK: "tf.CalibrationStatisticsSaver" + // CHECK-SAME: output_file_path = "main_2.pb" + func.func @main(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<1.000000e+01> : tensor + %cst_0 = stablehlo.constant dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32> + %c = stablehlo.constant dense : tensor + %cst_1 = stablehlo.constant dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32> + %cst_2 = stablehlo.constant dense<-0.000000e+00> : tensor + %cst_3 = stablehlo.constant dense<[[0.335351914, 0.084816426, -0.664676845]]> : tensor<1x3xf32> + %cst_4 = stablehlo.constant dense<[[0.117216609, 0.933735609, 0.0728900209]]> : tensor<1x3xf32> + %output, %min, %max, 
%histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor, tensor, tensor<0xi64>) + %0 = stablehlo.reduce(%output init: %cst_2) applies stablehlo.add across dimensions = [0, 1] : (tensor<1x4xf32>, tensor) -> tensor + %1 = stablehlo.compare GT, %0, %cst : (tensor, tensor) -> tensor + %2:2 = "stablehlo.if"(%1) ({ + %3 = "tf.XlaCallModule"(%output, %cst_0, %cst_3) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_2, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_2", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %output_5, %min_6, %max_7, %histogram_8 = "tf.CustomAggregator"(%3) <{calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + stablehlo.return %c, %output_5 : tensor, tensor<1x3xf32> + }, { + %3 = "tf.XlaCallModule"(%output, %cst_1, %cst_4) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_1, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %output_5, %min_6, %max_7, %histogram_8 = "tf.CustomAggregator"(%3) <{calibration_method = 1 : i32, id = "2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<0xi64>) + stablehlo.return %c, %output_5 : tensor, tensor<1x3xf32> + }) : (tensor) -> (tensor, tensor<1x3xf32>) + return %2#1 : tensor<1x3xf32> + } + func.func private @composite_dot_general_with_bias_same_shape_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_with_bias_same_shape_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_calibration_statistics_saver_with_skipping.mlir 
b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_calibration_statistics_saver_with_skipping.mlir new file mode 100644 index 00000000000000..97d546afe2b723 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_calibration_statistics_saver_with_skipping.mlir @@ -0,0 +1,47 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-insert-calibration-statistics-saver='aggregator-ops-to-ignore=skipping_id' | FileCheck %s + +func.func @serving_default(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x2x2x2xf32>) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}} { + %cst = "tf.Const"() <{value = dense<[[[[-0.891899645, 0.392044574], [0.77720493, 1.31188095], [0.255048186, 2.700150e+00]], [[-1.08111858, -0.406604826], [-0.298575521, -2.25356531], [-1.00201964, 2.54532099]], [[-1.34911358, 0.279911458], [-0.868258893, -1.36708188], [0.866317451, -2.05804896]]], [[[-0.591397941, 0.331505477], [0.715151429, 2.64073896], [1.27163255, 0.206143498]], [[0.474211812, 1.45044816], [0.119936548, 2.54149938], [-0.939900994, 0.438387245]], [[-1.12486279, -1.09022558], [0.82202208, 1.04652023], [1.30316162, 2.62054276]]]]> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 5 : i32, id = "skipping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) + %0 = "tf.Conv2D"(%output, %cst) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true}> {attr_map = "0:strides,1:use_cudnn_on_gpu,2:padding,3:explicit_paddings,4:dilations", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + %output_1, %min_2, %max_3, %histogram_4 = "tf.CustomAggregator"(%0) <{calibration_method = 5 : i32, id = "keeping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x2x2x2xf32>) -> (tensor<1x2x2x2xf32>, tensor, tensor, tensor<512xi64>) + %1 = "tf.Identity"(%output_1) {device = ""} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32> + return %1 : tensor<1x2x2x2xf32> +} +// CHECK-LABEL: @serving_default +// CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" +// CHECK-SAME: <{calibration_method = 5 : i32, id = "skipping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" +// CHECK-SAME: <{calibration_method = 5 : i32, id = "keeping_id", num_bins = 32 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}> : (tensor<1x3x4x3xf32>) -> (tensor<1x3x4x3xf32>, tensor, tensor, tensor<512xi64>) +// CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) +// CHECK-SAME: <{calibration_methods = [5 : i32], ids = ["keeping_id"], output_file_path = "serving_default_0.pb"}> : (tensor, tensor, tensor<512xi64>) -> () +// CHECK: return + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + func.func
@main(%arg0: tensor<10x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<10x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<0.000000e+00>: tensor<10x1024x3xf32> + %output, %min, %max, %histogram = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "skipping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x1024xf32>) -> (tensor<10x1x1024xf32>, tensor, tensor, tensor<0xi64>) + %0 = "tf.XlaCallModule"(%output, %cst) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %output_0, %min_1, %max_2, %histogram_3 = "tf.CustomAggregator"(%0) <{calibration_method = 1 : i32, id = "keeping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<10x1x3xf32>) -> (tensor<10x1x3xf32>, tensor, tensor, tensor<0xi64>) + return %output_0 : tensor<10x1x3xf32> + } + // CHECK-LABEL: @main + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], %[[MIN_O:.*]], %[[MAX_O:.*]], %[[HISTOGRAM_0:.*]] = "tf.CustomAggregator" + // CHECK-SAME: <{calibration_method = 1 : i32, id = "skipping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], %[[MIN_1:.*]], %[[MAX_1:.*]], %[[HISTOGRAM_1:.*]] = "tf.CustomAggregator" + // CHECK-SAME: <{calibration_method = 1 : i32, id = "keeping_id", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> + // CHECK: "tf.CalibrationStatisticsSaver"(%[[MIN_1]], %[[MAX_1]], %[[HISTOGRAM_1]]) + // CHECK-SAME: <{calibration_methods = [1 : i32], ids = ["keeping_id"], output_file_path = "main_0.pb"}> : (tensor, tensor, tensor<0xi64>) -> () + // CHECK: return + + func.func private @composite_dot_general_with_relu_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %cst = stablehlo.constant dense<0.000000e+00> : tensor<10x1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1], precision = [DEFAULT, DEFAULT] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %1 = stablehlo.maximum %0, %cst : tensor<10x1x3xf32> + return %1 : tensor<10x1x3xf32> + } + // CHECK-LABEL: func.func private @composite_dot_general_with_relu_fn_1 + // CHECK-NOT: "tf.CalibrationStatisticsSaver" +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_weight_param.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_weight_param.mlir index 89ff96efecf471..6a194023dbbfc1 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_weight_param.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_weight_param.mlir @@ -1,14 +1,15 @@ // RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-insert-weight-param | FileCheck %s -// Test that q/dq pair is inserted between constant and XlaCallModule op -// with quantizable trait and function name containing conv. +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with empty `weight_only_ptq` method +// and function name containing conv. -func.func @qdq_for_conv_weight(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> attributes {tf._original_func_name = "main_0"} { +func.func @qdq_for_conv_weight_empty(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> attributes {tf._original_func_name = "main_0"} { %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> %0 = "tf.XlaCallModule"(%arg0, %cst) { Sout = [#tf_type.shape<1x2x2x2>], _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", - _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq { }", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64 @@ -16,25 +17,28 @@ func.func @qdq_for_conv_weight(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32 return %0 : tensor<1x2x2x2xf32> } -// CHECK-LABEL: func.func @qdq_for_conv_weight +// CHECK-LABEL: func.func @qdq_for_conv_weight_empty // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> -// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform> -// CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform>) -> tensor<2x3x3x2xf32> -// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) <{Sout = [#tf_type.shape<1x2x2x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> +// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3x3x2xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq { }" +// CHECK-SAME: (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> // CHECK: return %[[CALL]] : tensor<1x2x2x2xf32> // ----- -// Test that q/dq pair is inserted between constant and XlaCallModule op -// with quantizable trait and function name containing dot_general. +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with empty `weight_only_ptq` method and +// function name containing dot_general. 
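An aside on the expected numbers: the per-tensor scale 0.0023622048182750312 in the CHECK lines above (and in the per-tensor tests that follow) is consistent with symmetric int8 weight quantization of a constant whose elements are all 3.000000e-01, i.e. scale = max(|w|) / 127, with the literal first rounded to float32. A minimal sketch of that arithmetic (an assumption about how the scale arises, not code from the pass):

import numpy as np

# Every weight element is 3.000000e-01, so max(|w|) is the float32 value of 0.3.
max_abs = np.float64(np.float32(0.3))
qmax = 127  # symmetric int8 quantization uses the range [-127, 127]
scale = max_abs / qmax
print(scale)  # 0.0023622048182750312, the scale in the quant.uniform types above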
-func.func @qdq_for_dot_general_weight(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { +func.func @qdq_for_dot_general_weight_empty(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> %0 = "tf.XlaCallModule"(%arg0, %cst) { Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", - _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64 @@ -42,16 +46,228 @@ func.func @qdq_for_dot_general_weight(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> return %0 : tensor<1x3xf32> } -// CHECK-LABEL: func.func @qdq_for_dot_general_weight +// CHECK-LABEL: func.func @qdq_for_dot_general_weight_empty // CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3xf32>}> : () -> tensor<2x3xf32> -// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> -// CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x!quant.uniform>) -> tensor<2x3xf32> -// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }" +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> // CHECK: return %[[CALL]] : tensor<1x3xf32> // ----- +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `per_tensor` and function name containing conv. 
+ +func.func @qdq_for_conv_weight_per_tensor(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x2x2x2>], _entry_function = @composite_conv_fn, + _original_entry_function = "composite_conv_fn", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> + return %0 : tensor<1x2x2x2xf32> +} + +// CHECK-LABEL: func.func @qdq_for_conv_weight_per_tensor +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> +// CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3x3x2xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}" +// CHECK-SAME: (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x2x2x2xf32> +// CHECK: return %[[CALL]] : tensor<1x2x2x2xf32> + +// ----- + +// Test that q/dq pair with per-tensor quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `per_tensor` and function name containing dot_general. 
+ +func.func @qdq_for_dot_general_weight_per_tensor(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}", _stablehlo_module_attrs = {}, + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} + +// CHECK-LABEL: func.func @qdq_for_dot_general_weight_per_tensor +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> +// CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3xf32>}> : () -> tensor<2x3xf32> +// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>> +// CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<2x3xf32> +// CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[DQ]]) +// CHECK-SAME: _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {per_tensor {}}}}" +// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// CHECK: return %[[CALL]] : tensor<1x3xf32> + +// ----- + +// Test that q/dq pair with per-channel quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `quantized_type` without specified quantization dimension and function name +// containing conv.
+ +module attributes {tf_saved_model.semantics} { + func.func private @qdq_for_conv_weight_per_channel_default(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], version = 5 : i64, + _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", + _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + device = "" + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } + + // CHECK: func.func private @qdq_for_conv_weight_per_channel_default(%[[ARG0:.+]]: tensor<1x3x4x3xf32>) + // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> + // CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> + // CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<2x3x3x2xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]]) + // CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + // CHECK: return %[[CALL]] + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } + // CHECK: func private @composite_conv_fn + // CHECK: %[[CONV:.+]] = stablehlo.convolution + // CHECK: return %[[CONV]] +} + +// ----- + +// Test that q/dq pair with per-channel quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `quantized_type` without specified quantization dimension and function name +// containing dot_general.
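In the per-channel variants around this point, the expected type carries one scale per channel along quantization dimension 3, and because the weight constant is uniform, every channel ends up with the same scale. A rough numpy model of per-channel scale computation under the same symmetric int8 assumption (a hypothetical helper, not the pass's implementation):

import numpy as np

def per_channel_scales(w, axis, qmax=127):
    # Reduce |w| over every dimension except the quantization axis.
    reduce_axes = tuple(d for d in range(w.ndim) if d != axis)
    return np.abs(w).max(axis=reduce_axes) / qmax

w = np.full((2, 3, 3, 2), np.float32(0.3), dtype=np.float64)
print(per_channel_scales(w, axis=3))  # two equal scales of ~0.0023622048182750312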
+ +module attributes {tf_saved_model.semantics} { + func.func private @qdq_for_dot_general_weight_per_channel_default(%arg0: tensor<4x3x6x5xf32>) -> tensor<4x3x6x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<4x3x5x2xf32>} : () -> tensor<4x3x5x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<4x3x6x2>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", + _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func.func private @qdq_for_dot_general_weight_per_channel_default(%[[ARG0:.+]]: tensor<4x3x6x5xf32>) + // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<4x3x5x2xf32>}> : () -> tensor<4x3x5x2xf32> + // CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<4x3x5x2xf32>) -> tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> + // CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<4x3x5x2xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]]) + // CHECK-SAME: (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + // CHECK: return %[[CALL]] + + func.func private @composite_dot_general_fn(%arg0: tensor<4x3x6x5xf32>, %arg1: tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func private @composite_dot_general_fn + // CHECK: %[[DOT:.+]] = stablehlo.dot_general + // CHECK: return %[[DOT]] +} + +// ----- + +// Test that q/dq pair with per-channel quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `quantized_type` with specified quantization dimension and function name +// containing conv.
+ +module attributes {tf_saved_model.semantics} { + func.func private @qdq_for_conv_weight_per_channel(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], version = 5 : i64, + _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + device = "" + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } + + // CHECK: func.func private @qdq_for_conv_weight_per_channel(%[[ARG0:.+]]: tensor<1x3x4x3xf32>) + // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> + // CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> + // CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<2x3x3x2xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]]) + // CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + // CHECK: return %[[CALL]] + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } + // CHECK: func private @composite_conv_fn + // CHECK: %[[CONV:.+]] = stablehlo.convolution + // CHECK: return %[[CONV]] +} + +// ----- + +// Test that q/dq pair with per-channel quantization parameter is inserted +// between constant and XlaCallModule op with `weight_only_ptq` method of +// `quantized_type` with specified quantization dimension and function name +// containing dot_general.
+ +module attributes {tf_saved_model.semantics} { + func.func private @qdq_for_dot_general_weight_per_channel(%arg0: tensor<4x3x6x5xf32>) -> tensor<4x3x6x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<4x3x5x2xf32>} : () -> tensor<4x3x5x2xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<4x3x6x2>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}", + _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + device = "", dim_args_spec = [], disabled_checks = [], + has_token_input_output = false, module = "", platforms = [], + version = 5 : i64 + } : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func.func private @qdq_for_dot_general_weight_per_channel(%[[ARG0:.+]]: tensor<4x3x6x5xf32>) + // CHECK: %[[CST:.+]] = "tf.Const"() <{value = dense<3.000000e-01> : tensor<4x3x5x2xf32>}> : () -> tensor<4x3x5x2xf32> + // CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<4x3x5x2xf32>) -> tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>> + // CHECK: %[[DQ:.+]] = "quantfork.dcast"(%[[Q]]) : (tensor<4x3x5x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<4x3x5x2xf32> + // CHECK: %[[CALL:.+]] = "tf.XlaCallModule"(%[[ARG0]], %[[DQ]]) + // CHECK-SAME: (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + // CHECK: return %[[CALL]] + + func.func private @composite_dot_general_fn(%arg0: tensor<4x3x6x5xf32>, %arg1: tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2] : (tensor<4x3x6x5xf32>, tensor<4x3x5x2xf32>) -> tensor<4x3x6x2xf32> + return %0 : tensor<4x3x6x2xf32> + } + // CHECK: func private @composite_dot_general_fn + // CHECK: %[[DOT:.+]] = stablehlo.dot_general + // CHECK: return %[[DOT]] +} + +// ----- + // Test that q/dq pair is not inserted between constant and XlaCallModule op // whose entry function name does not include conv nor dot_general. 
@@ -59,7 +275,7 @@ func.func @no_qdq_except_conv_and_dot_general(%arg0: tensor<2x3x2xi64>) -> tenso %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<3x4x2xf32>} : () -> tensor<3x4x2xf32> %0 = "tf.XlaCallModule"(%cst, %arg0) { Sout = [#tf_type.shape<1x3>], _entry_function = @composite_gather_fn, - _original_entry_function = "composite_gather_fn", + _original_entry_function = "composite_gather_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64 @@ -81,7 +297,7 @@ func.func @no_qdq_for_non_weight_constant(%arg0: tensor<1x2xf32>, %arg1: tensor< %0 = "tf.XlaCallModule"(%arg0, %arg1, %cst) { Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_fn, _original_entry_function = "composite_dot_general_with_bias_fn", - _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq { }", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64 @@ -96,7 +312,7 @@ func.func @no_qdq_for_non_weight_constant(%arg0: tensor<1x2xf32>, %arg1: tensor< // ----- // Test that q/dq pair is not inserted between constant and XlaCallModule op -// without quantizable trait. +// without `weight_only_ptq` method. func.func @no_qdq_for_not_quantizable_call(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> @@ -116,6 +332,27 @@ func.func @no_qdq_for_not_quantizable_call(%arg0: tensor<1x2xf32>) -> tensor<1x3 // ----- +// Test that q/dq pair is not inserted between constant and XlaCallModule op +// with different method. + +func.func @no_qdq_for_not_quantizable_call(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.000000e-01> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) { + Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, + _original_entry_function = "composite_dot_general_fn", + _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], + disabled_checks = [], has_token_input_output = false, module = "", + platforms = [], _quantization_method = "static_range_ptq { }", version = 5 : i64 + } : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} + +// CHECK-LABEL: func.func @no_qdq_for_not_quantizable_call +// CHECK-NOT: quantfork.qcast +// CHECK-NOT: quantfork.dcast + +// ----- + // Test that q/dq pair is not inserted when constant has multiple users. 
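Taken together, the negative tests above and the multiple-users test below outline when the pass declines to insert a q/dq pair. A condensed model of that gating, distilled from what the tests exercise rather than from the pass source (the function and parameter names are hypothetical):

def should_insert_weight_qdq(entry_fn_name, quantization_method,
                             weight_is_constant, weight_num_users):
    # Only conv/dot_general composites with a weight_only_ptq method and a
    # single-use constant weight operand receive a q/dq pair.
    supported_fn = "conv" in entry_fn_name or "dot_general" in entry_fn_name
    weight_only = quantization_method.startswith("weight_only_ptq")
    return supported_fn and weight_only and weight_is_constant and weight_num_users == 1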
func.func @no_qdq_for_multiple_users(%arg0: tensor<2x2xf32>) -> tensor<2x3xf32> attributes {tf._original_func_name = "main_0"} { @@ -123,7 +360,7 @@ func.func @no_qdq_for_multiple_users(%arg0: tensor<2x2xf32>) -> tensor<2x3xf32> %0 = "tf.XlaCallModule"(%arg0, %cst) { Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", - _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", + _stablehlo_module_attrs = {}, _quantization_method = "weight_only_ptq { }", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64 diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions.mlir index fa722c2fc71c88..eb4c2416024512 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions.mlir @@ -247,9 +247,9 @@ func.func @conv_with_relu_dynamic_fn(%arg0: tensor) -> tensor // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) // CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[CONV]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM]] // CHECK: return %[[MAX]] : tensor @@ -293,9 +293,9 @@ func.func @dot_general_with_relu_dynamic_fn(%arg0: tensor) -> tenso // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_relu_dynamic_fn_1 -// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 // CHECK: %[[SHAPE_OF:.*]] = shape.shape_of %[[DOT_GENERAL]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM]] // CHECK: return %[[MAX]] : tensor @@ -342,9 +342,9 @@ func.func @conv_with_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> // CHECK: } // CHECK-LABEL: private @composite_conv_with_relu6_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[CONV]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor<1x3x3x4xf32> // CHECK: } @@ -367,9 +367,9 @@ func.func @dot_general_with_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x6 // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_relu6_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], 
%[[DOT_GENERAL]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> // CHECK: } @@ -392,9 +392,9 @@ func.func @conv_with_relu6_dynamic_fn(%arg0: tensor) -> tensor -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[CONV]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor // CHECK: } @@ -417,9 +417,9 @@ func.func @dot_general_with_relu6_dynamic_fn(%arg0: tensor) -> tens // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_relu6_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[DOT_GENERAL]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor // CHECK: } @@ -444,8 +444,8 @@ func.func @dot_general_with_bias_same_shape_and_relu_fn(%arg0: tensor<1x1x167xf3 // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_and_relu_fn_1 -// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] // CHECK: return %[[MAX]] : tensor<1x1x64xf32> @@ -472,9 +472,9 @@ func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x // CHECK: } // CHECK-LABEL: private @composite_conv_with_bias_and_relu_fn_1 -// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] // CHECK: return %[[MAX]] : tensor<1x3x3x4xf32> @@ -501,9 +501,9 @@ func.func @dot_general_with_bias_and_relu_fn(%arg0: tensor<1x1x167xf32>) -> tens // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_bias_and_relu_fn_1 -// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] // CHECK: return %[[MAX]] : tensor<1x1x64xf32> @@ -533,12 +533,12 @@ func.func @conv_with_bias_and_relu_dynamic_fn(%arg0: tensor) -> t // CHECK: } // CHECK-LABEL: private @composite_conv_with_bias_and_relu_dynamic_fn_1 -// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) // CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[CONV]] // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] // CHECK: %[[ADD:.*]] = 
stablehlo.add %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] // CHECK: %[[SHAPE_OF_1:.*]] = shape.shape_of %[[ADD]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_1:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF_1]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[DYNAMIC_BROADCAST_IN_DIM_1]] // CHECK: return %[[MAX]] : tensor @@ -591,12 +591,12 @@ func.func @dot_general_with_bias_and_relu_dynamic_fn(%arg0: tensor) // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_bias_and_relu_dynamic_fn_1 -// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 // CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[DOT_GENERAL]] // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] // CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] // CHECK: %[[SHAPE_OF_1:.*]] = shape.shape_of %[[ADD]] +// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_1:.*]] = stablehlo.dynamic_broadcast_in_dim %[[CONST]], %[[SHAPE_OF_1]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[DYNAMIC_BROADCAST_IN_DIM_1]] // CHECK: return %[[MAX]] : tensor @@ -623,10 +623,10 @@ func.func @dot_general_with_bias_same_shape_and_relu6_fn(%arg0: tensor<1x1x167xf // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_and_relu6_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> // CHECK: } @@ -653,11 +653,11 @@ func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3 // CHECK: } // CHECK-LABEL: private @composite_conv_with_bias_and_relu6_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor<1x3x3x4xf32> // CHECK: } @@ -684,11 +684,11 @@ func.func @dot_general_with_bias_and_relu6_fn(%arg0: tensor<1x1x167xf32>) -> ten // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_bias_and_relu6_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant 
dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> // CHECK: } @@ -716,12 +716,12 @@ func.func @conv_with_bias_and_relu6_dynamic_fn(%arg0: tensor) -> // CHECK: } // CHECK-LABEL: private @composite_conv_with_bias_and_relu6_dynamic_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) // CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[CONV]] // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor // CHECK: } @@ -771,12 +771,12 @@ func.func @dot_general_with_bias_and_relu6_dynamic_fn(%arg0: tensor // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1 -// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> -// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 // CHECK: %[[SHAPE_OF_0:.*]] = shape.shape_of %[[DOT_GENERAL]] // CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.*]] = stablehlo.dynamic_broadcast_in_dim %arg2, %[[SHAPE_OF_0]] +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> // CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor // CHECK: } @@ -808,3 +808,48 @@ func.func @gather_fn() -> tensor<2x3x2x2xi32> { // CHECK: %[[GATHER:.*]] = "stablehlo.gather"(%arg0, %arg1) // CHECK: return %[[GATHER]] : tensor<2x3x2x2xi32> // CHECK: } + +// ----- + +// Test that the names of composite functions are deterministic. There are 3 +// unsorted functions in this module and each function has 2 quantizable ops.
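The CHECK expectations below group the indices by function in lexicographic order: @conv_1_fn owns composite_conv_fn_1 and _2, @conv_2_fn owns _3 and _4, @conv_3_fn owns _5 and _6, with the two lifted calls inside each function numbered in descending source order. One way to reproduce that numbering, as an illustration of the determinism being tested rather than the lifting pass itself:

counter = 0
assignment = {}
for fn in sorted(["conv_3_fn", "conv_1_fn", "conv_2_fn"]):
    ops = ["first_conv", "second_conv"]   # two quantizable ops per function
    for op in reversed(ops):              # the later op receives the smaller index
        counter += 1
        assignment[(fn, op)] = f"composite_conv_fn_{counter}"
# assignment[("conv_1_fn", "first_conv")] == "composite_conv_fn_2", matching the CHECKs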
+module { + func.func @conv_3_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } + + func.func @conv_1_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } + + func.func @conv_2_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + %2 = stablehlo.convolution(%1, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> + func.return %2: tensor<1x3x3x4xf32> + } +} + +// CHECK-LABEL: @conv_3_fn +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_6, _original_entry_function = "composite_conv_fn_6" +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_5, _original_entry_function = "composite_conv_fn_5" + +// CHECK-LABEL: @conv_1_fn +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_2, _original_entry_function = "composite_conv_fn_2" +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_1, _original_entry_function = "composite_conv_fn_1" + +// CHECK-LABEL: @conv_2_fn +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_4, _original_entry_function = "composite_conv_fn_4" +// CHECK: tf.XlaCallModule +// CHECK-SAME: _entry_function = @composite_conv_fn_3, _original_entry_function = "composite_conv_fn_3" \ No newline at end of file diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/post_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/post_quantize.mlir index ae2f57081e40f7..301a0661633425 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/post_quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/post_quantize.mlir @@ -37,7 +37,7 @@ func.func @remove_volatile_qdq_with_requantization(%arg0: tensor<3x2xf32>) -> te // CHECK-LABEL: @quantize_constant 
// CHECK-SAME: %[[ARG0:.*]]: tensor<1x3xf32> func.func @quantize_constant(%arg0: tensor<1x3xf32>) -> tensor<1x2xf32> { - // CHECK-DAG: %[[QCST:.*]] = stablehlo.constant() {value = dense<-78> : tensor<3x2xi8>} : () -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> + // CHECK-DAG: %[[QCST:.*]] = stablehlo.constant() <{value = dense<-78> : tensor<3x2xi8>}> : () -> tensor<3x2x!quant.uniform:f32, 5.000000e-03>> // CHECK-DAG: %[[Q1:.*]] = stablehlo.uniform_quantize %[[ARG0]] // CHECK-NOT: "quantfork.qcast" // CHECK: %[[DOT:.*]] = stablehlo.dot %[[Q1]], %[[QCST]] diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir index d94e1ca3787a3c..f5626e8b1506be 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir @@ -14,12 +14,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q2]], %[[Q1]]) // CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[CALL]], %[[Q0]]) + // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> + // CHECK-SAME: window_dimensions = array // CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor> // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> // CHECK: stablehlo.return %[[MAX]] : tensor> - // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> - // CHECK-SAME: window_dimensions = array - // CHECK-SAME: (tensor<2x3x1x3x!quant.uniform>, tensor>) -> tensor<2x3x1x3x!quant.uniform> + // CHECK: (tensor<2x3x1x3x!quant.uniform>, tensor>) -> tensor<2x3x1x3x!quant.uniform> // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[REDUCE]]) // CHECK: return %[[DQ]] @@ -70,12 +70,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[ARG0]]) // CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[Q1]], %[[Q0]]) + // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> + // CHECK-SAME: window_dimensions = array // CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor> // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> // CHECK: stablehlo.return %[[MAX]] : tensor> - // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> - // CHECK-SAME: window_dimensions = array - // CHECK-SAME: (tensor<2x3x1x1024x!quant.uniform>, tensor>) -> tensor<2x3x1x1024x!quant.uniform> + // CHECK: (tensor<2x3x1x1024x!quant.uniform>, tensor>) -> tensor<2x3x1x1024x!quant.uniform> // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[CST1]]) // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[REDUCE]], %[[Q2]]) @@ -132,12 +132,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[CALL]] // CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[RESHAPE]], %[[Q0]]) + // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64> + // CHECK-SAME: window_dimensions = array // CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor> // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> // CHECK: stablehlo.return %[[MAX]] : tensor> - // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 
0]]> : tensor<3x2xi64>
- // CHECK-SAME: window_dimensions = array
- // CHECK-SAME: (tensor<2x3x3x!quant.uniform>, tensor>) -> tensor<2x3x3x!quant.uniform>
+ // CHECK: (tensor<2x3x3x!quant.uniform>, tensor>) -> tensor<2x3x3x!quant.uniform>
// CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[REDUCE]])
// CHECK: return %[[DQ]]
@@ -191,12 +191,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[ARG0]])
// CHECK: %[[REDUCE:.*]] = "stablehlo.reduce_window"(%[[Q1]], %[[Q0]])
+ // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>
+ // CHECK-SAME: window_dimensions = array
// CHECK: %[[ARG1:.*]]: tensor>, %[[ARG2:.*]]: tensor>
// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor>
// CHECK: stablehlo.return %[[MAX]] : tensor>
- // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>
- // CHECK-SAME: window_dimensions = array
- // CHECK-SAME: (tensor<2x3x1024x!quant.uniform>, tensor>) -> tensor<2x3x1024x!quant.uniform>
+ // CHECK: (tensor<2x3x1024x!quant.uniform>, tensor>) -> tensor<2x3x1024x!quant.uniform>
// CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[REDUCE]]
// CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[CST1]])
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir
index 25aab3044a3496..7a905dfbe58a9e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir
@@ -291,17 +291,17 @@ module attributes {tf_saved_model.semantics} {
// CHECK-SAME: %[[ARG2:.*]]: tensor<2x3x2xi64>
func.func private @composite_and_gather(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x2xf32>, %arg2: tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> {
// CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[ARG0]]) {volatile} : (tensor<3x4x5xf32>) -> tensor<3x4x5x!quant.uniform>
- // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[ARG1]]) {volatile} : (tensor<3x5x2xf32>) -> tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>>
+ // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[ARG1]]) {volatile} : (tensor<3x5x2xf32>) -> tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>
// CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]])
- // CHECK-SAME: (tensor<3x4x5x!quant.uniform>, tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>>) -> tensor<3x4x2x!quant.uniform>
+ // CHECK-SAME: (tensor<3x4x5x!quant.uniform>, tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>) -> tensor<3x4x2x!quant.uniform>
// CHECK: %[[GATHER:.*]] = "stablehlo.gather"(%[[CALL]], %[[ARG2]])
// CHECK-SAME: (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi64>) -> tensor<2x3x2x2x!quant.uniform>
// CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[GATHER]]) : (tensor<2x3x2x2x!quant.uniform>) -> tensor<2x3x2x2xf32>
// CHECK: return %[[DQ]]
%0 = "quantfork.qcast"(%arg0) {volatile} : (tensor<3x4x5xf32>) -> tensor<3x4x5x!quant.uniform>
%1 = "quantfork.dcast"(%0) : (tensor<3x4x5x!quant.uniform>) -> tensor<3x4x5xf32>
- %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<3x5x2xf32>) -> tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>>
- %3 = "quantfork.dcast"(%2) : (tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>>) -> tensor<3x5x2xf32>
+ %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<3x5x2xf32>) -> tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>
+ %3 = "quantfork.dcast"(%2) : (tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>) -> tensor<3x5x2xf32>
%4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _quantization_method = "static_range_ptq {}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x5xf32>, tensor<3x5x2xf32>) -> tensor<3x4x2xf32>
%5 = "quantfork.qcast"(%4) {volatile} : (tensor<3x4x2xf32>) -> tensor<3x4x2x!quant.uniform>
%6 = "quantfork.dcast"(%5) : (tensor<3x4x2x!quant.uniform>) -> tensor<3x4x2xf32>
@@ -321,10 +321,10 @@ module attributes {tf_saved_model.semantics} {
// CHECK: quantized_dot_general_fn_1
// CHECK-SAME: %[[ARG2:.*]]: tensor<3x4x5x!quant.uniform>
- // CHECK-SAME: %[[ARG3:.*]]: tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>>
+ // CHECK-SAME: %[[ARG3:.*]]: tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>
func.func private @composite_dot_general_fn_1(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x2xf32>) -> tensor<3x4x2xf32> attributes {_from_xla_call_module} {
// CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]]
- // CHECK-SAME: (tensor<3x4x5x!quant.uniform>, tensor<3x5x2x!quant.uniform:f32, 6.000000e-03:13>>) -> tensor<3x4x2x!quant.uniform>
+ // CHECK-SAME: (tensor<3x4x5x!quant.uniform>, tensor<3x5x2x!quant.uniform:f32, 6.000000e-03>>) -> tensor<3x4x2x!quant.uniform>
// CHECK: %[[Q3:.*]] = stablehlo.uniform_quantize %0 : (tensor<3x4x2x!quant.uniform>) -> tensor<3x4x2x!quant.uniform>
// CHECK: return %[[Q3]]
%0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x4x5xf32>, tensor<3x5x2xf32>) -> tensor<3x4x2xf32>
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir
index 81e8b4bde5e13e..e152a90ce72c3a 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir
@@ -1,4 +1,4 @@
-// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-quantize=enable-weight-only=true | FileCheck %s
+// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-quantize | FileCheck %s
// Test that hybrid quantized dot_general is produced when q/dq pair only exists
// for weight.
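For orientation, the pattern being matched here is a q/dq (quantize/dequantize) pair wrapped around the weight constant only, while the activation argument never touches a quantfork op. A minimal pre-quantization sketch, assuming the 2x3 weight and the 0.3/127 scale these tests use throughout; the i8 storage type is an assumption on our part:

  %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
  // q/dq on the weight only; %arg0 stays float end to end.
  %q = "quantfork.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform<i8:f32, 0.0023622048182750312>>
  %dq = "quantfork.dcast"(%q) : (tensor<2x3x!quant.uniform<i8:f32, 0.0023622048182750312>>) -> tensor<2x3xf32>
  %dot = stablehlo.dot_general %arg0, %dq, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>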
@@ -6,8 +6,8 @@ module attributes {tf_saved_model.semantics} {
func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
%cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
- %0 = "quantfork.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform>
- %1 = "quantfork.dcast"(%0) : (tensor<2x3x!quant.uniform>) -> tensor<2x3xf32>
+ %0 = "quantfork.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform>
+ %1 = "quantfork.dcast"(%0) : (tensor<2x3x!quant.uniform>) -> tensor<2x3xf32>
%2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
return %2 : tensor<1x3xf32>
}
@@ -21,15 +21,15 @@ module attributes {tf_saved_model.semantics} {
// CHECK-LABEL: quantize_dot_general_fn
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32>
// CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
-// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform>
+// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform>
// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[Q]])
-// CHECK-SAME: {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
+// CHECK-SAME: {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
// CHECK: return %[[CALL]]
// CHECK: quantized_dot_general_fn
-// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]]
-// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
+// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
// CHECK: return %[[DOT]]
// -----
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir
index 09f002559b7830..a9d805412fdd2a 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir
@@ -17,14 +17,14 @@ module attributes {tf_saved_model.semantics} {
// calls the quantized entry function.
// CHECK: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>
+// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>) -> tensor<1x3x!quant.uniform>
// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32>
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32>
// CHECK-PER-TENSOR: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, {{.*}}>
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform>
// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32>
@@ -51,7 +51,7 @@ module attributes {tf_saved_model.semantics} {
// -----
-// Tests that `stablehlo.dot_general` with `batching_dim` is not quantized.
+// Tests that `stablehlo.dot_general` with `batching_dim` is quantized.
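Concretely, the op under test carries its batch dimension through: with batching_dims = [0] x [0] and contracting_dims = [2] x [1], a <2x2x2> activation against a <2x2x3> weight is one 2x2 by 2x3 matmul per batch element, yielding <2x2x3>. A float-only sketch of that shape arithmetic (the constant value is hypothetical; shapes are taken from the test below):

  %w = stablehlo.constant dense<3.000000e-01> : tensor<2x2x3xf32>
  %0 = stablehlo.dot_general %arg0, %w, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x2x2xf32>, tensor<2x2x3xf32>) -> tensor<2x2x3xf32>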
module attributes {tf_saved_model.semantics} {
func.func private @quantize_dot_general_batch_per_tensor_quantized_fn(%arg0: tensor<2x2x2xf32>) -> tensor<2x2x3xf32> attributes {tf._original_func_name = "main_0"} {
@@ -62,9 +62,9 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor<2x2x3xf32>
}
// CHECK: func.func private @quantize_dot_general_batch_per_tensor_quantized_fn(%[[ARG_0:.+]]: tensor<2x2x2xf32>) -> tensor<2x2x3xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x2x3xi8>} : () -> tensor<2x2x3x!quant.uniform>
+// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x2x3xi8>}> : () -> tensor<2x2x3x!quant.uniform:f32, {{.*}}>>
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<2x2x2xf32>) -> tensor<2x2x2x!quant.uniform>
-// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<2x2x2x!quant.uniform>, tensor<2x2x3x!quant.uniform) -> tensor<2x2x3x!quant.uniform>
+// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<2x2x2x!quant.uniform>, tensor<2x2x3x!quant.uniform:f32, {{.*}}>) -> tensor<2x2x3x!quant.uniform>
// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<2x2x3x!quant.uniform) -> tensor<2x2x3xf32>
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<2x2x3xf32>
@@ -88,16 +88,16 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor<1x3xf32>
}
// CHECK: func.func private @quantize_dot_general_with_bias_same_shape_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>
-// CHECK: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform
+// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>
+// CHECK: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x3xi32>}> : () -> tensor<1x3x!quant.uniform
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform>
// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32>
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32>
// CHECK-PER-TENSOR: func.func private @quantize_dot_general_with_bias_same_shape_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}>
-// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform
+// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x3xi32>}> : () -> tensor<1x3x!quant.uniform
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform>
// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32>
@@ -137,16 +137,16 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor
}
// CHECK: func.func private @quantize_dot_general_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>
-// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<3xi32>} : () -> tensor<3x!quant.uniform
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>
+// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<3xi32>}> : () -> tensor<3x!quant.uniform
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor>, tensor<2x3x!quant.uniform:f32:1, {{.*}}>, tensor<3x!quant.uniform) -> tensor
// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor
// CHECK-PER-TENSOR: func.func private @quantize_dot_general_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}>
-// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<3xi32>} : () -> tensor<3x!quant.uniform
+// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<3xi32>}> : () -> tensor<3x!quant.uniform
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) {_quantization_method = "static_range_ptq { }"} : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<3x!quant.uniform) -> tensor
// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor
@@ -219,7 +219,7 @@ module attributes {tf_saved_model.semantics} {
// calls the quantized entry function.
// CHECK: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
+// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform>
// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]])
// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform>
@@ -227,7 +227,7 @@ module attributes {tf_saved_model.semantics} {
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32>
// CHECK-PER-TENSOR: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]])
// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform>
@@ -286,7 +286,7 @@ func.func @quantize_conv_fn_per_tensor(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3
// calls the quantized entry function.
// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32>
-// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform>
// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform>
// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32>
@@ -338,8 +338,8 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor<1x3x4x2xf32>
}
// CHECK: func.func private @quantize_conv_with_bias_1d_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
-// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<47978> : tensor<2xi32>} : () -> tensor<2x!quant.uniform>
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
+// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<47978> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform>
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform>
// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform>
@@ -347,8 +347,8 @@ module attributes {tf_saved_model.semantics} {
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32>
// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_1d_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
-// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2xi32>} : () -> tensor<2x!quant.uniform
+// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform
@@ -407,8 +407,8 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor<1x3x4x2xf32>
}
// CHECK: func.func private @quantize_conv_with_bias_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
-// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
+// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform>
// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform
@@ -416,8 +416,8 @@ module attributes {tf_saved_model.semantics} {
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32>
// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
-// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform
@@ -475,8 +475,8 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor
}
// CHECK: func.func private @quantize_conv_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
-// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
+// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor
@@ -484,8 +484,8 @@ module attributes {tf_saved_model.semantics} {
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor
// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
-// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor
@@ -569,8 +569,8 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor
}
// CHECK: func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
-// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
+// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor>
@@ -578,8 +578,8 @@ module attributes {tf_saved_model.semantics} {
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor
// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
-// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-PER-TENSOR-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor>
@@ -666,8 +666,8 @@ module attributes {tf_saved_model.semantics} {
return %2 : tensor
}
// CHECK: func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
-// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>
+// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-SAME: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor>
@@ -675,8 +675,8 @@ module attributes {tf_saved_model.semantics} {
// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor
// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"}
-// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
-// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform
+// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>
+// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() <{value = dense<{{.*}}> : tensor<1x1x1x2xi32>}> : () -> tensor<1x1x1x2x!quant.uniform
// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor>
// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]])
// CHECK-PER-TENSOR: {_quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor>
@@ -810,8 +810,8 @@ module attributes {tf_saved_model.semantics} {
%5 = "quantfork.stats"(%4) {layerStats = dense<[5.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32>
return %5 : tensor<1x3xf32>
}
-// CHECK: %[[CONST:.+]] = stablehlo.constant() {value = dense<127> : tensor<1x2xi8>} : () -> tensor<1x2x!quant.uniform>
-// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>>
+// CHECK: %[[CONST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<1x2xi8>}> : () -> tensor<1x2x!quant.uniform>
+// CHECK: %[[CONST_0:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {{.*}}>>
// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform>
// CHECK: %[[CALL:.+]] = call @quantized_add_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) {_quantization_method = "static_range_ptq { }"} : (tensor<1x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform>
// CHECK: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32>
@@ -837,3 +837,53 @@ module attributes {tf_saved_model.semantics} {
// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform>
// CHECK: return %[[UNIFORM_QUANTIZE]] : tensor<1x3x!quant.uniform>
}
+
+// -----
+
+// Tests that `stablehlo.add` is not quantized and an error is emitted when the
+// entry function is not singular, i.e. it includes two ops rather than one.
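For contrast, a composite body that would pass the singular-op check contains exactly one quantizable op between its arguments and its return; a hypothetical sketch, not taken from this patch:

  func.func private @composite_add_fn_singular(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {_from_xla_call_module} {
    // A single op in the body: this function is singular and would be quantized.
    %0 = stablehlo.add %arg0, %arg1 : tensor<1x2xf32>
    return %0 : tensor<1x2xf32>
  }

The test below instead gives the body a second stablehlo.add, which is what trips the not-singular path and leaves the float op in place.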
+
+module attributes {tf_saved_model.semantics} {
+ func.func private @not_quantize_fn_when_not_singular(%arg: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {tf._original_func_name = "main_0"} {
+ %cst = "tf.Const"() {value = dense<1.00000000e-1> : tensor<1x2xf32>} : () -> tensor<1x2xf32>
+ %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+ %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x2>], _entry_function = @composite_add_fn, _original_entry_function = "composite_add_fn", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32>
+ // expected-error@+1 {{'stablehlo.uniform_dequantize' op operand #0 must be tensor of 4/8/16/32-bit uniform quantized signed integer or 4/8/16/32-bit uniform quantized unsigned integer or 4/8/16/32-bit uniform quantized per axis signed integer or 4/8/16/32-bit uniform quantized per axis unsigned integer values, but got 'tensor<1x2xf32>'}}
+ %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+ return %2 : tensor<1x2xf32>
+ }
+
+ func.func private @composite_add_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {_from_xla_call_module} {
+ %0 = stablehlo.add %arg0, %arg1 : tensor<1x2xf32>
+ %1 = stablehlo.add %0, %arg1 : tensor<1x2xf32>
+ return %1 : tensor<1x2xf32>
+ }
+}
+
+// -----
+
+// Tests that `stablehlo.gather` without `static_range_ptq` is not quantized.
+
+module attributes {tf_saved_model.semantics} {
+ func.func private @not_quantize_singular_op_without_static_range_ptq(%arg: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} {
+ %cst = "tf.Const"() {value = dense<1> : tensor<2x3x2xi32>} : () -> tensor<2x3x2xi32>
+ %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<3x4x2xf32>) -> tensor<3x4x2xf32>
+ %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<2x3x2x2>], _entry_function = @composite_gather_fn, _original_entry_function = "composite_gather_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32>
+ // expected-error@+1 {{'stablehlo.uniform_dequantize' op operand #0 must be tensor of 4/8/16/32-bit uniform quantized signed integer or 4/8/16/32-bit uniform quantized unsigned integer or 4/8/16/32-bit uniform quantized per axis signed integer or 4/8/16/32-bit uniform quantized per axis unsigned integer values, but got 'tensor<2x3x2x2xf32>'}}
+ %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2xf32>
+ return %2 : tensor<2x3x2x2xf32>
+ }
+
+ func.func private @composite_gather_fn(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> attributes {_from_xla_call_module} {
+ %0 = "stablehlo.gather"(%arg0, %arg1) {
+ dimension_numbers = #stablehlo.gather<
+ offset_dims = [2, 3],
+ collapsed_slice_dims = [0],
+ start_index_map = [1, 0],
+ index_vector_dim = 2>,
+ slice_sizes = array,
+ indices_are_sorted = false
+ } : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32>
+ return %0 : tensor<2x3x2x2xf32>
+ }
+}
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir
index b96cb15039d763..148e1330cfca34 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir
@@ -1,11 +1,11 @@
// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \
-// RUN: -stablehlo-quantize-composite-functions=enable-weight-only=true | FileCheck --check-prefix=CHECK %s
+// RUN: -stablehlo-quantize-composite-functions | FileCheck --check-prefix=CHECK %s
-// Test that weight-only quantized dot_general op is produced when
-// weight_only_ptq is provided.
+// Test that per-tensor weight-only quantized dot_general op is produced when
+// empty `weight_only_ptq` is provided.
module attributes {tf_saved_model.semantics} {
- func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
+ func.func private @quantize_dot_general_per_tensor(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
%0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
%1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
return %1 : tensor<1x3xf32>
@@ -17,25 +17,25 @@ module attributes {tf_saved_model.semantics} {
}
}
-// CHECK-LABEL: quantize_dot_general_fn
+// CHECK-LABEL: quantize_dot_general_per_tensor
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32>
-// CHECK: %[[CST:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform>
-// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>
+// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3xf32>
// CHECK: return %[[CALL]]
// CHECK: quantized_dot_general_fn
-// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3xf32>
// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]]
-// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform>) -> tensor<1x3xf32>
+// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3xf32>
// CHECK: return %[[DOT]]
// -----
-// Test that hybrid quantized convolution op is produced when weight_only_ptq is
-// provided.
+// Test that per-tensor weight-only quantized convolution op is produced when
+// empty `weight_only_ptq` is provided.
module attributes {tf_saved_model.semantics} {
- func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
+ func.func private @quantize_conv_per_tensor(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
%0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32>
%1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq { }", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
return %1 : tensor<1x3x4x2xf32>
@@ -47,14 +47,76 @@ module attributes {tf_saved_model.semantics} {
}
}
-// CHECK-LABEL: quantize_conv_fn
+// CHECK-LABEL: quantize_conv_per_tensor
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32>
-// CHECK: %[[CST:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform>
-// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>
+// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq { }"} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3x4x2xf32>
// CHECK: return %[[CALL]]
// CHECK: quantized_conv_fn
-// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>, %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32>
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>, %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3x4x2xf32>
// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG1]], %[[ARG2]])
-// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform>) -> tensor<1x3x4x2xf32>
+// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor<1x3x4x2xf32>
+// CHECK: return %[[CONV]]
+
+// -----
+
+// Test that per-channel weight-only quantized dot_general op is produced when
+// `weight_only_ptq` with `dimension_specs` is provided.
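The only knob that separates these per-channel tests from the per-tensor ones above is the _quantization_method textproto on the tf.XlaCallModule op: an empty dimension_specs {} asks for per-channel quantization with the quantized dimension inferred, while an explicit dimension field pins it. All three forms appear verbatim in this patch:

  // Per-tensor:            _quantization_method = "weight_only_ptq { }"
  // Per-channel, inferred: _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}"
  // Per-channel, explicit: _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}"

Here key: 1 selects operand 1 of the composite call, i.e. the weight.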
+
+module attributes {tf_saved_model.semantics} {
+ func.func private @quantize_dot_general_per_channel(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
+ %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
+ %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+ return %1 : tensor<1x3xf32>
+ }
+
+ func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+ %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+ return %0 : tensor<1x3xf32>
+ }
+}
+
+// CHECK-LABEL: quantize_dot_general_per_channel
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3xi8>}> : () -> tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>>
+// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}"}
+// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3xf32>
+// CHECK: return %[[CALL]]
+
+// CHECK: quantized_dot_general_fn
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>, %[[ARG2:.+]]: tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3xf32>
+// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]]
+// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform:f32:1, {0.0023622048182750312,0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3xf32>
+// CHECK: return %[[DOT]]
+
+// -----
+
+// Test that per-channel weight-only quantized convolution op is produced when
+// `weight_only_ptq` with `dimension_specs` is provided.
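For the convolution flavor, the inferred channel axis lands on the output-feature dimension of the [0, 1, i, o] kernel, i.e. dimension 3, so the per-channel weight type carries one scale per output feature (o = 2 here), versus a single scale in the per-tensor test above. Side by side, with the i8 storage type assumed:

  // Per-tensor (one scale):
  //   tensor<2x3x3x2x!quant.uniform<i8:f32, 0.0023622048182750312>>
  // Per-channel on dim 3 (two scales, one per output feature):
  //   tensor<2x3x3x2x!quant.uniform<i8:f32:3, {0.0023622048182750312,0.0023622048182750312}>>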
+
+module attributes {tf_saved_model.semantics} {
+ func.func private @quantize_conv_per_channel(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
+ %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32>
+ %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+ return %1 : tensor<1x3x4x2xf32>
+ }
+
+ func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} {
+ %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+ return %0 : tensor<1x3x4x2xf32>
+ }
+}
+
+// CHECK-LABEL: quantize_conv_per_channel
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant() <{value = dense<127> : tensor<2x3x3x2xi8>}> : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>
+// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) {_quantization_method = "weight_only_ptq {input_quantized_types {key: 1, value {dimension_specs {}}}}"}
+// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3x4x2xf32>
+// CHECK: return %[[CALL]]
+
+// CHECK: quantized_conv_fn
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>, %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3x4x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG1]], %[[ARG2]])
+// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor<1x3x4x2xf32>
// CHECK: return %[[CONV]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir
index 02e1c5e9923915..ab55a8bc2989bb 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir
@@ -22,23 +22,23 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
func.func @main(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x64xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} {
%0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32>
%1 = stablehlo.constant dense<1.000000e+03> : tensor<1x3xf32>
- %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>)
+ %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>)
%3 = "tf.XlaCallModule"(%2#0, %0, %1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32>
- %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
+ %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
%5 = stablehlo.constant dense<1.000000e+03> : tensor<3x64xf32>
%6 = stablehlo.constant dense<1.000000e+03> : tensor<1x64xf32>
- %7:4 = "tf.CustomAggregator"(%4#0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
+ %7:4 = "tf.CustomAggregator"(%4#0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
%8 = "tf.XlaCallModule"(%7#0, %5, %6) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x3xf32>, tensor<3x64xf32>, tensor<1x64xf32>) -> tensor<1x64xf32>
- %9:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x64xf32>) -> (tensor<1x64xf32>, tensor, tensor, tensor<*xi64>)
+ %9:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x64xf32>) -> (tensor<1x64xf32>, tensor, tensor, tensor<*xi64>)
return %9#0 : tensor<1x64xf32>
}
// CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_1
- // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}
+ // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}>
// CHECK: %[[XLA_CALL_MODULE_0:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_0:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable"}
// CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_0]])
// CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<{{.*}}>, #tf_type.shape<{{.*}}>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0
- // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}
+ // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[CUSTOM_AGGREGATOR_1]]) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}>
// CHECK: %[[XLA_CALL_MODULE_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP_1:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _tfl_quant_trait = "fully_quantizable"}
// CHECK: %[[CUSTOM_AGGREGATOR_3:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE_1:.*]])
// CHECK: return %[[CUSTOM_AGGREGATOR_3]] : tensor<1x64xf32>
@@ -111,14 +111,14 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK: @serving_default
func.func @serving_default(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} {
%0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32>
- %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>)
+ %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>)
%2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32>
- %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
+ %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
return %3#0 : tensor<1x3xf32>
}
// CHECK: %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}}, module = "", platforms = ["CPU", "TPU"], version = 9 : i64}> {_entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}}
- // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}
+ // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}>
// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[STABLEHLO_SUBGRAPH_TO_XLA_CALL_MODULE_OP:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1"
// CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]])
// CHECK: return %[[CUSTOM_AGGREGATOR_1]]
@@ -143,14 +143,14 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK-LABEL: @random_name
func.func @random_name(%arg0: tensor<1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} {
%0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32>
- %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>)
+ %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x1024xf32>) -> (tensor<1x1024xf32>, tensor, tensor, tensor<*xi64>)
%2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32>
- %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
+ %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1x3xf32>) -> (tensor<1x3xf32>, tensor, tensor, tensor<*xi64>)
return %3#0 : tensor<1x3xf32>
}
// CHECK: %[[CONSTANT:.*]] = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32>
- // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32}
+ // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}>
// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR:.*]], %[[XLA_CALL_MODULE_EXTRACTED_FROM_SUBGRAPH:.*]]) <{Sout = [#tf_type.shape<1x3>], {{.*}}, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1"
// CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]])
// CHECK: return %[[CUSTOM_AGGREGATOR_1]]
@@ -185,9 +185,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p
// CHECK: @serving_default
func.func @serving_default(%arg0: tensor<1024x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1024x3xf32> {tf_saved_model.index_path = ["output1"]}, tensor<1024x3xf32> {tf_saved_model.index_path = ["output2"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} {
%0 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32>
- %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>)
+ %1:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>)
%2 = "tf.XlaCallModule"(%1#0, %0) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32>
- %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>)
+ %3:4 = "tf.CustomAggregator"(%2) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>)
%4 = stablehlo.constant dense<1.000000e+03> : tensor<1024x3xf32>
%5 = stablehlo.add %3#0, %4 : tensor<1024x3xf32>
%6 = stablehlo.multiply %3#0, %0 : tensor<1024x3xf32>
@@ -195,7 +195,7 @@ module attributes {tf.versions = {bad_consumers =
[], min_consumer = 12 : i32, p } // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_2:.*]]:2 = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 @@ -235,16 +235,16 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %0 = stablehlo.constant dense<1.000000e+03> : tensor<3x11xf32> // %1 is large enough that it won't be duplicated. %1 = stablehlo.constant dense<1.000000e+01> : tensor<3x11xf32> - %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x3xf32>) -> (tensor<3x3xf32>, tensor, tensor, tensor<*xi64>) + %2:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<3x3xf32>) -> (tensor<3x3xf32>, tensor, tensor, tensor<*xi64>) %3 = "tf.XlaCallModule"(%2#0, %0) {Sout = [#tf_type.shape<3x11>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x3xf32>, tensor<3x11xf32>) -> tensor<3x11xf32> - %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<3x11xf32>) -> (tensor<3x11xf32>, tensor, tensor, tensor<*xi64>) + %4:4 = "tf.CustomAggregator"(%3) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<3x11xf32>) -> (tensor<3x11xf32>, tensor, tensor, tensor<*xi64>) %5 = stablehlo.add %4#0, %1 : tensor<3x11xf32> %6 = stablehlo.multiply %5, %1 : tensor<3x11xf32> return %6 : tensor<3x11xf32> } // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<3x11>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} + // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : 
i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<3x11>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_2:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_2:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_2]]) <{Sout = [#tf_type.shape<3x11>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_0 @@ -293,14 +293,14 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %4 = stablehlo.compare EQ, %3, %2, NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> stablehlo.custom_call @shape_assertion(%4) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () %5 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> - %6:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %6:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) %7 = "tf.XlaCallModule"(%6#0, %5) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> - %8:4 = "tf.CustomAggregator"(%7) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %8:4 = "tf.CustomAggregator"(%7) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) %9 = stablehlo.add %8#0, %0 : tensor<1024x3xf32> return %9 : tensor<1024x3xf32> } // CHECK: %[[SUBGRAPH_0:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_0]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = 
"tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 @@ -339,16 +339,16 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %2 = stablehlo.remainder %0, %1 : tensor<1024x3xf32> %3 = "tf.Identity"(%2) {device = ""} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> %4 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> - %5:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %5:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) %6 = "tf.XlaCallModule"(%5#0, %4) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> - %7:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %7:4 = "tf.CustomAggregator"(%6) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) %8 = stablehlo.add %7#0, %0 : tensor<1024x3xf32> return %8 : tensor<1024x3xf32> } // CHECK: %[[SUBGRAPH_0:.*]]:2 = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_2 // CHECK: %[[IDENTIFY:.*]] = "tf.Identity"(%[[SUBGRAPH_0]]#1) {device = ""} : (tensor<1024x3xf32>) -> tensor<1024x3xf32> // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_1]]) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_2:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_0]]#0) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 @@ -394,14 +394,14 @@ module attributes 
{tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p %7 = stablehlo.compare EQ, %6, %2, NOTYPE : (tensor<1024x3xf32>, tensor<1024x3xf32>) -> tensor<1024x3xi1> stablehlo.custom_call @shape_assertion(%7) {error_message = "Shape assertion failed", has_side_effect = true} : (tensor<1024x3xi1>) -> () %8 = stablehlo.constant dense<2.000000e+03> : tensor<1024x3xf32> - %9:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) + %9:4 = "tf.CustomAggregator"(%arg0) {calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x1024xf32>) -> (tensor<1024x1024xf32>, tensor, tensor, tensor<*xi64>) %10 = "tf.XlaCallModule"(%9#0, %8) {Sout = [#tf_type.shape<1024x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _tfl_quant_trait = "fully_quantizable", dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1024x1024xf32>, tensor<1024x3xf32>) -> tensor<1024x3xf32> - %11:4 = "tf.CustomAggregator"(%10) {calibration_method = 1 : i32, id = "1", initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) + %11:4 = "tf.CustomAggregator"(%10) {calibration_method = 1 : i32, id = "1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32} : (tensor<1024x3xf32>) -> (tensor<1024x3xf32>, tensor, tensor, tensor<*xi64>) %12 = stablehlo.add %11#0, %0 : tensor<1024x3xf32> return %12 : tensor<1024x3xf32> } // CHECK: %[[SUBGRAPH_0:.*]]:2 = "tf.XlaCallModule"() <{Sout = [#tf_type.shape<1024x3>, #tf_type.shape<1024x3>], {{.*}} ["CPU", "TPU"], {{.*}}}> {_entry_function = @_stablehlo_main_1 - // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "0"}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} + // CHECK: %[[CUSTOM_AGGREGATOR_0:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "0", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> // CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[SUBGRAPH_0]]#1) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1" // CHECK: %[[CUSTOM_AGGREGATOR_1:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[XLA_CALL_MODULE:.*]]) // CHECK: %[[SUBGRAPH_1:.*]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_1]], %[[SUBGRAPH_0]]#0) <{Sout = [#tf_type.shape<1024x3>], {{.*}}}> {_entry_function = @_stablehlo_main_0 diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir index 831131a4c64555..5e443526c650f1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir @@ -100,8 +100,9 @@ 
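Two things change in every CHECK line of the test file above: the op's attributes move from the trailing discardable-attribute dictionary {...} into the inherent-attribute dictionary <{...}>, and initial_num_bins becomes num_bins. The new ordering in the expectations is not hand-picked: MLIR stores and prints attribute dictionaries sorted by name, so the rename moves the entry after min_percentile. A minimal C++ sketch of that sorting behavior (illustrative only; MakeAggregatorAttrs is a hypothetical helper, not code from this change):

#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/MLIRContext.h"

mlir::DictionaryAttr MakeAggregatorAttrs(mlir::MLIRContext* ctx) {
  mlir::Builder b(ctx);
  mlir::NamedAttrList attrs;
  // Insertion order is deliberately scrambled here.
  attrs.set("num_bins", b.getI32IntegerAttr(0));
  attrs.set("id", b.getStringAttr("0"));
  attrs.set("calibration_method", b.getI32IntegerAttr(1));
  // getDictionary() returns entries sorted by name regardless of insertion
  // order: calibration_method, id, num_bins -- the order FileCheck sees.
  return attrs.getDictionary(ctx);
}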
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir
index 831131a4c64555..5e443526c650f1 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir
@@ -100,8 +100,9 @@ func.func @nchw_conv_with_bias_add_max_pool(%arg0: tensor<1x2x5x5xf32>) -> tenso
 // CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[WEIGHT_CONST]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2xf32>, tensor<3x3x2x4xf32>) -> tensor<1x5x5x4xf32>
 // CHECK: %[[ADD:.+]] = stablehlo.add %[[CONV]], %[[BIAS_CONST]] : tensor<1x5x5x4xf32>
 // CHECK: %[[REDUCE_WINDOW_MAX:.+]] = "stablehlo.reduce_window"(%[[ADD]], %[[INIT_VALUE_CONST:.+]])
+// CHECK: <{window_dimensions = array<i64: 1, 3, 3, 1>, window_strides = array<i64: 1, 2, 2, 1>}>
 // CHECK: stablehlo.maximum
-// CHECK: {window_dimensions = array<i64: 1, 3, 3, 1>, window_strides = array<i64: 1, 2, 2, 1>} : (tensor<1x5x5x4xf32>, tensor<f32>) -> tensor<1x2x2x4xf32>
+// CHECK: (tensor<1x5x5x4xf32>, tensor<f32>) -> tensor<1x2x2x4xf32>
 // CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[REDUCE_WINDOW_MAX]], dims = [0, 3, 1, 2] : (tensor<1x2x2x4xf32>) -> tensor<1x4x2x2xf32>
 // CHECK: return %[[TRANSPOSE_1]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.cc b/tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.cc
index efda4282b2cbec..640f0ebc5c5061 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/bfloat16_type.cc
@@ -29,7 +29,7 @@ bool IsLargeFloatType(Type type) {
 }
 
 Type ToBfloat16Type(Type type) {
-  if (auto shaped = type.dyn_cast<ShapedType>()) {
+  if (auto shaped = mlir::dyn_cast<ShapedType>(type)) {
     const Type elem = shaped.getElementType();
     if (IsLargeFloatType(elem)) {
       return shaped.clone(BFloat16Type::get(type.getContext()));
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils_test.cc
index fefdfbbb543123..1558a5478e604f 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/quantization/stablehlo/utils/math_utils.h"
 
 #include <gmock/gmock.h>
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 
 namespace mlir::quant::stablehlo {
 namespace {
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.cc b/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.cc
index 2f801565b93a1f..555e8af25b374f 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/tf_type_utils.cc
@@ -37,8 +37,8 @@ limitations under the License.
 namespace mlir::quant::tensorflow {
 
 bool IsTFQintType(const Type type) {
-  return type.isa<TF::Qint8Type, TF::Qint16Type, TF::Qint32Type, TF::Quint8Type, TF::Quint16Type>();
+  return mlir::isa<TF::Qint8Type, TF::Qint16Type, TF::Qint32Type, TF::Quint8Type, TF::Quint16Type>(type);
 }
 
 Type GetIntTypeFromTFQint(const Type type) {
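The bfloat16_type.cc and tf_type_utils.cc hunks above are part of a broader migration: MLIR has deprecated the member-style casts (type.isa<T>(), type.dyn_cast<T>()) in favor of the free functions mlir::isa<T>(type) and mlir::dyn_cast<T>(type). A self-contained sketch of the pattern (assumes only standard MLIR headers; ElementTypeOrSelf is a hypothetical helper, not code from this change):

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"

// Member style (deprecated):     type.dyn_cast<mlir::ShapedType>()
// Free-function style (current): mlir::dyn_cast<mlir::ShapedType>(type)
mlir::Type ElementTypeOrSelf(mlir::Type type) {
  if (auto shaped = mlir::dyn_cast<mlir::ShapedType>(type))
    return shaped.getElementType();
  return type;  // Non-shaped types are returned unchanged.
}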
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/register_common_dialects.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -188,31 +189,31 @@ TEST(GetIntTypeFromTFQintTest, ChecksIntTypesFromTFQint) { auto type = GetIntTypeFromTFQint(TF::Qint8Type::get(context.get())); EXPECT_TRUE(llvm::isa(type)); - EXPECT_EQ(type.dyn_cast().getWidth(), 8); - EXPECT_FALSE(type.dyn_cast().isSigned()); - EXPECT_FALSE(type.dyn_cast().isUnsigned()); + EXPECT_EQ(mlir::dyn_cast(type).getWidth(), 8); + EXPECT_FALSE(mlir::dyn_cast(type).isSigned()); + EXPECT_FALSE(mlir::dyn_cast(type).isUnsigned()); type = GetIntTypeFromTFQint(TF::Qint16Type::get(context.get())); EXPECT_TRUE(llvm::isa(type)); - EXPECT_EQ(type.dyn_cast().getWidth(), 16); - EXPECT_FALSE(type.dyn_cast().isSigned()); - EXPECT_FALSE(type.dyn_cast().isUnsigned()); + EXPECT_EQ(mlir::dyn_cast(type).getWidth(), 16); + EXPECT_FALSE(mlir::dyn_cast(type).isSigned()); + EXPECT_FALSE(mlir::dyn_cast(type).isUnsigned()); type = GetIntTypeFromTFQint(TF::Qint32Type::get(context.get())); EXPECT_TRUE(llvm::isa(type)); - EXPECT_EQ(type.dyn_cast().getWidth(), 32); - EXPECT_FALSE(type.dyn_cast().isSigned()); - EXPECT_FALSE(type.dyn_cast().isUnsigned()); + EXPECT_EQ(mlir::dyn_cast(type).getWidth(), 32); + EXPECT_FALSE(mlir::dyn_cast(type).isSigned()); + EXPECT_FALSE(mlir::dyn_cast(type).isUnsigned()); type = GetIntTypeFromTFQint(TF::Quint8Type::get(context.get())); EXPECT_TRUE(llvm::isa(type)); - EXPECT_EQ(type.dyn_cast().getWidth(), 8); - EXPECT_TRUE(type.dyn_cast().isUnsigned()); + EXPECT_EQ(mlir::dyn_cast(type).getWidth(), 8); + EXPECT_TRUE(mlir::dyn_cast(type).isUnsigned()); type = GetIntTypeFromTFQint(TF::Quint16Type::get(context.get())); EXPECT_TRUE(llvm::isa(type)); - EXPECT_EQ(type.dyn_cast().getWidth(), 16); - EXPECT_TRUE(type.dyn_cast().isUnsigned()); + EXPECT_EQ(mlir::dyn_cast(type).getWidth(), 16); + EXPECT_TRUE(mlir::dyn_cast(type).isUnsigned()); // Non qint types are returned as is. 
EXPECT_EQ(GetIntTypeFromTFQint(IntegerType::get(type.getContext(), 32)), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD index 94dc1b1569620f..1762a67d7d3acf 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD @@ -348,7 +348,6 @@ cc_library( "passes/insert_quantized_functions.cc", "passes/insert_restore_op.cc", "passes/insert_save_op.cc", - "passes/issue_ids_of_custom_aggregation_ops.cc", "passes/lift_hashtable_ops_as_args.cc", "passes/lift_quantizable_spots_as_functions.cc", "passes/lift_quantizable_spots_as_functions.inc", @@ -504,6 +503,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD index 9ae8d6401afcd6..6e6ee260f48a2c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD @@ -12,13 +12,8 @@ load( "get_compatible_with_portable", "tf_kernel_library", "tf_py_strict_test", - "tf_python_pybind_extension", ) load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") -load( - "//tensorflow/core/platform:build_config_root.bzl", - "if_static", -) package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -30,49 +25,6 @@ package( licenses = ["notice"], ) -# Directly linked to `custom_aggregator_op`. In general, one should avoid directly depending on -# this target to avoid the ODR violation. Depend on `calibrator_singleton` instead. 
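The comment being deleted above is the key to the BUILD deletions that follow: calibrator_singleton_impl was linked directly into the kernel library only, while all other code was expected to reach the same state through the header-only calibrator_singleton target (via if_static), because linking a second copy of a singleton's implementation into another shared object gives that object its own instance. A hypothetical C++ illustration of the failure mode the comment warns about (not TensorFlow code):

// If two .so files each link their own copy of this class, Get() returns a
// different object in each library, so data reported in one library is
// invisible to the other -- hence the warning against depending on the
// impl target directly.
class StatsRegistry {
 public:
  static StatsRegistry& Get() {
    static StatsRegistry instance;  // One instance *per linked copy*.
    return instance;
  }
  void Report(float min, float max) { min_ = min; max_ = max; }

 private:
  StatsRegistry() = default;
  float min_ = 0.0f;
  float max_ = 0.0f;
};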
-cc_library( - name = "calibrator_singleton_impl", - srcs = ["calibrator_singleton.cc"], - hdrs = ["calibrator_singleton.h"], - compatible_with = get_compatible_with_portable(), - deps = [ - ":calibration_statistics_collector_average_min_max", - ":calibration_statistics_collector_base", - ":calibration_statistics_collector_histogram", - ":calibration_statistics_collector_min_max", - ":calibration_statistics_proto_cc", - "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", - "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", - "//tensorflow/core:framework", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:span", - ], -) - -cc_library( - name = "calibrator_singleton", - hdrs = ["calibrator_singleton.h"], - compatible_with = get_compatible_with_portable(), - deps = if_static([":calibrator_singleton_impl"]) + [ - ":calibration_statistics_collector_base", - ":calibration_statistics_proto_cc", - "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", - "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", - "//tensorflow/core:framework", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:span", - ], -) - cc_library( name = "calibration_statistics_collector_base", hdrs = ["calibration_statistics_collector_base.h"], @@ -181,20 +133,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "calibrator_singleton_test", - size = "small", - srcs = ["calibrator_singleton_test.cc"], - deps = [ - ":calibration_statistics_proto_cc", - ":calibrator_singleton_impl", - "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "@com_google_googletest//:gtest_main", - ], -) - tf_kernel_library( name = "custom_aggregator_op", srcs = ["custom_aggregator_op.cc"], @@ -204,7 +142,6 @@ tf_kernel_library( "//tensorflow/compiler/mlir/quantization/tensorflow/python:__pkg__", ], deps = [ - ":calibrator_singleton_impl", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:calibration_parameters", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", @@ -238,7 +175,6 @@ tf_py_strict_test( deps = [ ":calibration_statistics_proto_py", ":gen_custom_aggregator_op_wrapper", - ":pywrap_calibration", "//tensorflow:tensorflow_py", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_py", "//tensorflow/python:pywrap_tensorflow", @@ -249,20 +185,6 @@ tf_py_strict_test( ], ) -tf_python_pybind_extension( - name = "pywrap_calibration", - srcs = ["pywrap_calibration.cc"], - pytype_srcs = ["pywrap_calibration.pyi"], - deps = [ - ":calibration_statistics_proto_cc", - ":calibrator_singleton", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@pybind11", - "@pybind11_protobuf//pybind11_protobuf:native_proto_caster", - ], -) - tf_kernel_library( name = "calibration_statistics_saver_op", srcs = ["calibration_statistics_saver_op.cc"], @@ -276,6 +198,7 @@ tf_kernel_library( ":calibration_statistics_collector_base", 
":calibration_statistics_collector_histogram", ":calibration_statistics_collector_min_max", + ":calibration_statistics_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", @@ -298,6 +221,7 @@ tf_cc_test( "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc index 8061ad3fe2d444..4b30fab0cc39fc 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op.cc @@ -26,6 +26,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h" @@ -36,6 +37,7 @@ limitations under the License. #include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/logging.h" #include "tsl/platform/file_system.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc index 8335722cdea929..15cb07f4b93270 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/path.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc deleted file mode 100644 index 74575b761737a3..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" - -#include -#include -#include -#include -#include - -#include "absl/base/attributes.h" -#include "absl/base/const_init.h" -#include "absl/strings/string_view.h" -#include "absl/synchronization/mutex.h" -#include "absl/types/span.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_average_min_max.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_histogram.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_min_max.h" -#include "tensorflow/core/framework/tensor.h" - -namespace tensorflow { -namespace calibrator { - -using ::stablehlo::quantization::CalibrationOptions; - -ABSL_CONST_INIT absl::Mutex CalibratorSingleton::lock_(absl::kConstInit); - -CalibratorSingleton& CalibratorSingleton::GetInstance() { - static CalibratorSingleton* calibrator = new CalibratorSingleton(); - return *calibrator; -} - -void CalibratorSingleton::ClearCollectedInformation() { - absl::MutexLock lock(&lock_); - - CalibratorSingleton& instance = GetInstance(); - instance.id_to_collector_.clear(); -} - -void CalibratorSingleton::ClearData(absl::string_view id) { - absl::MutexLock lock(&lock_); - - CalibratorSingleton& instance = GetInstance(); - - const std::string id_str{id}; - instance.id_to_collector_[id_str].reset(nullptr); -} - -void CalibratorSingleton::Report(absl::string_view id, const Tensor& min_tensor, - const Tensor& max_tensor, - const Tensor& histogram_tensor, - const CalibrationOptions& calib_opts) { - const float min_value = min_tensor.scalar()(); - const float max_value = max_tensor.scalar()(); - auto histogram_flat = histogram_tensor.flat(); - absl::Span histogram_data = - absl::MakeSpan(histogram_flat.data(), histogram_flat.size()); - Report(id, min_value, max_value, histogram_data, calib_opts); -} - -void CalibratorSingleton::Report(absl::string_view id, float min, float max, - absl::Span histogram, - const CalibrationOptions& calib_opts) { - absl::MutexLock lock(&lock_); - - CalibratorSingleton& instance = GetInstance(); - const std::string id_str{id}; - AssignIfNotExists(id_str, calib_opts); - instance.id_to_collector_[id_str]->Collect(min, max, histogram); -} - -std::optional CalibratorSingleton::GetStatistics( - absl::string_view id) { - absl::MutexLock lock(&lock_); - - CalibratorSingleton& instance = GetInstance(); - - const std::string id_str{id}; - - if (!instance.id_to_collector_[id_str]) { - return std::nullopt; - } - - return instance.id_to_collector_[id_str]->GetStatistics(); -} - -void CalibratorSingleton::AssignIfNotExists( - std::string id_str, const CalibrationOptions& calib_opts) { - CalibratorSingleton& instance = GetInstance(); - if 
(instance.id_to_collector_[id_str]) return; - - switch (calib_opts.calibration_method()) { - case CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX: - instance.id_to_collector_[id_str] = - std::make_unique(); - break; - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE: - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE: - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC: - case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY: - instance.id_to_collector_[id_str] = - std::make_unique(); - break; - case CalibrationOptions::CALIBRATION_METHOD_MIN_MAX: - default: - instance.id_to_collector_[id_str] = - std::make_unique(); - } -} - -} // namespace calibrator -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h deleted file mode 100644 index 8a6aee81ee9cbd..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATOR_SINGLETON_H_ -#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATOR_SINGLETON_H_ - -#include -#include -#include -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/strings/string_view.h" -#include "absl/synchronization/mutex.h" -#include "absl/types/optional.h" -#include "absl/types/span.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_collector_base.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" -#include "tensorflow/core/framework/tensor.h" - -namespace tensorflow { -namespace calibrator { - -using stablehlo::quantization::CalibrationOptions; - -// TODO: b/315084876 - Move to stablehlo quantizer directory. -class CalibratorSingleton { - public: - // Clears the collected information. - static void ClearCollectedInformation(); - - // Clears the collected data of the given node id. - static void ClearData(absl::string_view id); - - // Reports data to the singleton. Only calculates the required statistics - // based on CalibrationOptions. - static void Report(absl::string_view id, const Tensor& min_tensor, - const Tensor& max_tensor, const Tensor& histogram_tensor, - const CalibrationOptions& calib_opts); - - // Same as above but accepts primitive input types. 
- static void Report(absl::string_view id, float min, float max, - absl::Span histogram, - const CalibrationOptions& calib_opts); - - // Returns the calibration statistics of the given id. - static std::optional GetStatistics( - absl::string_view id); - - private: - static CalibratorSingleton& GetInstance(); - static absl::Mutex lock_; - static void AssignIfNotExists(std::string id_str, - const CalibrationOptions& calib_opts); - - absl::flat_hash_map> - id_to_collector_; - - CalibratorSingleton() = default; - ~CalibratorSingleton() = default; -}; - -} // namespace calibrator -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATOR_SINGLETON_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc deleted file mode 100644 index ca338b58c5909d..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc +++ /dev/null @@ -1,203 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" - -#include -#include -#include - -#include -#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace calibrator { -namespace { - -using ::stablehlo::quantization::CalibrationOptions; - -TEST(CalibratorSingletonTest, SimpleMinMax) { - CalibrationOptions calib_opts; - calib_opts.set_calibration_method( - CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - - CalibratorSingleton::Report(/*id=*/"1", /*min=*/1.0f, /*max=*/5.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - std::optional statistics = - CalibratorSingleton::GetStatistics(/*id=*/"1"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); - - CalibratorSingleton::Report(/*id=*/"1", /*min=*/1.0f, /*max=*/10.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"1"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 10.0f); - - CalibratorSingleton::Report(/*id=*/"1", /*min=*/-5.0f, /*max=*/5.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"1"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), -5.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 
10.0f); -} - -TEST(CalibratorSingletonTest, DifferentSessions) { - CalibrationOptions calib_opts; - calib_opts.set_calibration_method( - CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - - CalibratorSingleton::Report(/*id=*/"2", /*min=*/1.0f, /*max=*/5.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - std::optional statistics = - CalibratorSingleton::GetStatistics(/*id=*/"2"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); - - CalibratorSingleton::Report(/*id=*/"2", /*min=*/1.0f, /*max=*/10.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"2"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 10.0f); - - CalibratorSingleton::Report(/*id=*/"3", /*min=*/-5.0f, /*max=*/5.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"3"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), -5.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); -} - -TEST(CalibratorSingletonTest, ClearAndGetEmptyResult) { - std::vector> report_vec; - CalibrationOptions calib_opts; - calib_opts.set_calibration_method( - CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); - report_vec.push_back({1.0f, 2.0f, 3.0f, 4.0f, 10.0f}); - - CalibratorSingleton::Report(/*id=*/"4", /*min=*/1.0f, /*max=*/5.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - std::optional statistics = - CalibratorSingleton::GetStatistics(/*id=*/"4"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); - - CalibratorSingleton::ClearData(/*id=*/"4"); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"4"); - - EXPECT_FALSE(statistics.has_value()); -} - -TEST(CalibratorSingletonTest, ClearDataAndGetResults) { - CalibrationOptions calib_opts; - calib_opts.set_calibration_method( - CalibrationOptions::CALIBRATION_METHOD_MIN_MAX); - - CalibratorSingleton::Report(/*id=*/"5", /*min=*/1.0f, /*max=*/5.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - std::optional statistics = - CalibratorSingleton::GetStatistics(/*id=*/"5"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 5.0f); - - CalibratorSingleton::Report(/*id=*/"6", /*min=*/1.0f, /*max=*/10.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"6"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 10.0f); - - CalibratorSingleton::ClearData(/*id=*/"5"); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"5"); - - EXPECT_FALSE(statistics.has_value()); - - CalibratorSingleton::Report(/*id=*/"6", /*min=*/1.0f, /*max=*/10.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"6"); - - EXPECT_TRUE(statistics.has_value()); - 
EXPECT_EQ(statistics.value().min_max_statistics().global_min(), 1.0f); - EXPECT_EQ(statistics.value().min_max_statistics().global_max(), 10.0f); -} - -TEST(CalibratorSingletonTest, SimpleAverageMinMax) { - CalibrationOptions calib_opts; - calib_opts.set_calibration_method( - CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX); - - CalibratorSingleton::Report(/*id=*/"7", /*min=*/-10.0f, /*max=*/30.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - std::optional statistics = - CalibratorSingleton::GetStatistics(/*id=*/"7"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), -10.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 30.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 1); - - CalibratorSingleton::Report(/*id=*/"7", /*min=*/-20.0f, /*max=*/60.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"7"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), -30.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 90.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 2); - - CalibratorSingleton::Report(/*id=*/"7", /*min=*/-30.0f, /*max=*/90.0f, - /*histogram=*/{}, - /*calib_opts=*/calib_opts); - statistics = CalibratorSingleton::GetStatistics(/*id=*/"7"); - - EXPECT_TRUE(statistics.has_value()); - EXPECT_EQ(statistics.value().average_min_max_statistics().min_sum(), -60.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().max_sum(), 180.0f); - EXPECT_EQ(statistics.value().average_min_max_statistics().num_samples(), 3); -} - -} // namespace -} // namespace calibrator -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc index 66d932a44f6179..ea37ab7b2be9bf 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc @@ -19,7 +19,6 @@ limitations under the License. 
#include "absl/status/status.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/calibration_parameters.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/op.h" @@ -37,7 +36,6 @@ using ::stablehlo::quantization::CalculateBinIndexSafe; using ::stablehlo::quantization::CalculateBinWidth; using ::stablehlo::quantization::CalculateLowerBound; using ::stablehlo::quantization::CalibrationOptions; -using ::stablehlo::quantization::GetNumBins; using CPUDevice = ::Eigen::ThreadPoolDevice; using CalibrationMethod = ::stablehlo::quantization::CalibrationOptions_CalibrationMethod; @@ -52,7 +50,7 @@ REGISTER_OP("CustomAggregator") .Output("histogram: int64") .Attr("id: string") .Attr("calibration_method: int = 0") - .Attr("initial_num_bins: int = 0") + .Attr("num_bins: int = 0") .Attr("min_percentile: float = 0.0") .Attr("max_percentile: float = 0.0") .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { @@ -60,12 +58,9 @@ REGISTER_OP("CustomAggregator") c->set_output(1, c->Scalar()); c->set_output(2, c->Scalar()); - const tensorflow::AttrValue* calibration_method_attr; - TF_RETURN_IF_ERROR( - c->GetAttr("calibration_method", &calibration_method_attr)); - int32_t num_bins = GetNumBins( - static_cast(calibration_method_attr->i())); - c->set_output(3, c->MakeShape({num_bins})); + const tensorflow::AttrValue* num_bins_attr; + TF_RETURN_IF_ERROR(c->GetAttr("num_bins", &num_bins_attr)); + c->set_output(3, c->MakeShape({num_bins_attr->i()})); return absl::OkStatus(); }); @@ -77,13 +72,12 @@ class CustomAggregatorOp : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("id", &id_)); int calibration_method_value; - int initial_num_bins; + int num_bins; float min_percentile; float max_percentile; OP_REQUIRES_OK(context, context->GetAttr("calibration_method", &calibration_method_value)); - OP_REQUIRES_OK(context, - context->GetAttr("initial_num_bins", &initial_num_bins)); + OP_REQUIRES_OK(context, context->GetAttr("num_bins", &num_bins)); OP_REQUIRES_OK(context, context->GetAttr("min_percentile", &min_percentile)); OP_REQUIRES_OK(context, @@ -98,8 +92,7 @@ class CustomAggregatorOp : public OpKernel { absl::AbortedError("The calibration method must be specified.")); calib_opts_.set_calibration_method(calibration_method); - calib_opts_.mutable_calibration_parameters()->set_initial_num_bins( - initial_num_bins); + calib_opts_.mutable_calibration_parameters()->set_num_bins(num_bins); calib_opts_.mutable_calibration_parameters()->set_min_percentile( min_percentile); calib_opts_.mutable_calibration_parameters()->set_max_percentile( @@ -123,7 +116,7 @@ class CustomAggregatorOp : public OpKernel { context->template eigen_device()) = input_flat.maximum(); // Calculate histogram statistics. 
- int32_t num_bins = GetNumBins(calib_opts_.calibration_method()); + const int32_t num_bins = calib_opts_.calibration_parameters().num_bins(); Tensor* histogram_output = nullptr; OP_REQUIRES_OK(context, context->allocate_output("histogram", {num_bins}, &histogram_output)); @@ -133,11 +126,6 @@ class CustomAggregatorOp : public OpKernel { CalculateHistogramStatistics(context, input_tensor, min_value, max_value, num_bins, histogram_output); } - - // By passing calib_opts_ and input_tensor to CalibratorSingleton, - // CalibrationStatisticsCollector can calculate statistics for calibration. - calibrator::CalibratorSingleton::Report(id_, *min_output, *max_output, - *histogram_output, calib_opts_); } private: diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py index 5940803f470117..78bd79e43faf1c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/integration_test/custom_aggregator_op_test.py @@ -17,9 +17,7 @@ import tensorflow # pylint: disable=unused-import from tensorflow.compiler.mlir.quantization.stablehlo import quantization_config_pb2 as stablehlo_quant_config_pb2 -from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 as calib_stat_pb2 from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import custom_aggregator_op_wrapper -from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import pywrap_calibration from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -39,7 +37,6 @@ def setUp(self): def testBypassAndMinMax(self): with self.session(): - pywrap_calibration.clear_calibrator() input_tensor = array_ops.constant( [1.0, 2.0, 3.0, 4.0, 5.0], dtypes.float32 ) @@ -55,18 +52,8 @@ def testBypassAndMinMax(self): self.assertEqual(aggregator_output.max, 5.0) self.assertEmpty(aggregator_output.histogram) - statistics: calib_stat_pb2.CalibrationStatistics = ( - pywrap_calibration.get_statistics_from_calibrator('1') - ) - - min_val = statistics.min_max_statistics.global_min - max_val = statistics.min_max_statistics.global_max - - self.assertAllEqual((min_val, max_val), (1.0, 5.0)) - def testTwoIdentities(self): with self.session(): - pywrap_calibration.clear_calibrator() input_tensor1 = array_ops.constant( [1.0, 2.0, 3.0, 4.0, 5.0], dtypes.float32 ) @@ -97,80 +84,8 @@ def testTwoIdentities(self): self.assertEqual(aggregator2_output.max, -1.0) self.assertEmpty(aggregator2_output.histogram) - statistics: calib_stat_pb2 = ( - pywrap_calibration.get_statistics_from_calibrator('2') - ) - min_val = statistics.min_max_statistics.global_min - max_val = statistics.min_max_statistics.global_max - self.assertAllEqual((min_val, max_val), (1.0, 5.0)) - statistics: calib_stat_pb2 = ( - pywrap_calibration.get_statistics_from_calibrator('3') - ) - min_val = statistics.min_max_statistics.global_min - max_val = statistics.min_max_statistics.global_max - self.assertAllEqual((min_val, max_val), (-5.0, -1.0)) - - def testClearData(self): - with self.session(): - pywrap_calibration.clear_calibrator() - input_tensor1 = array_ops.constant( - [1.0, 2.0, 3.0, 4.0, 5.0], dtypes.float32 - ) - aggregator1 = 
custom_aggregator_op_wrapper.custom_aggregator( - input_tensor1, - '4', - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, - ) - aggregator1_output = self.evaluate(aggregator1) - self.assertAllEqual(aggregator1_output.output, [1.0, 2.0, 3.0, 4.0, 5.0]) - self.assertEqual(aggregator1_output.min, 1.0) - self.assertEqual(aggregator1_output.max, 5.0) - self.assertEmpty(aggregator1_output.histogram) - - input_tensor2 = array_ops.constant( - [-1.0, -2.0, -3.0, -4.0, -5.0], dtypes.float32 - ) - aggregator2 = custom_aggregator_op_wrapper.custom_aggregator( - input_tensor2, - '5', - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX, - ) - aggregator2_output = self.evaluate(aggregator2) - self.assertAllEqual( - aggregator2_output.output, [-1.0, -2.0, -3.0, -4.0, -5.0] - ) - self.assertEqual(aggregator2_output.min, -5.0) - self.assertEqual(aggregator2_output.max, -1.0) - self.assertEmpty(aggregator2_output.histogram) - - statistics: calib_stat_pb2 = ( - pywrap_calibration.get_statistics_from_calibrator('4') - ) - min_val = statistics.min_max_statistics.global_min - max_val = statistics.min_max_statistics.global_max - self.assertAllEqual((min_val, max_val), (1.0, 5.0)) - - statistics: calib_stat_pb2 = ( - pywrap_calibration.get_statistics_from_calibrator('5') - ) - min_val = statistics.min_max_statistics.global_min - max_val = statistics.min_max_statistics.global_max - self.assertAllEqual((min_val, max_val), (-5.0, -1.0)) - - pywrap_calibration.clear_data_from_calibrator('4') - with self.assertRaises(ValueError): - pywrap_calibration.get_statistics_from_calibrator('4') - - statistics: calib_stat_pb2 = ( - pywrap_calibration.get_statistics_from_calibrator('5') - ) - min_val = statistics.min_max_statistics.global_min - max_val = statistics.min_max_statistics.global_max - self.assertAllEqual((min_val, max_val), (-5.0, -1.0)) - def testBypassAndAverageMinMax(self): with self.session(): - pywrap_calibration.clear_calibrator() input_tensor1 = array_ops.constant( [-50.0, -25.0, 0.0, 25.0, 50.0], dtypes.float32 ) @@ -204,19 +119,8 @@ def testBypassAndAverageMinMax(self): self.assertEqual(aggregator2_output.max, 100.0) self.assertEmpty(aggregator2_output.histogram) - statistics: calib_stat_pb2 = ( - pywrap_calibration.get_statistics_from_calibrator('6') - ) - - min_sum = statistics.average_min_max_statistics.min_sum - max_sum = statistics.average_min_max_statistics.max_sum - num_samples = statistics.average_min_max_statistics.num_samples - - self.assertAllEqual((min_sum, max_sum, num_samples), (-150.0, 150.0, 2)) - def testHistogramCalibration(self): with self.session(): - pywrap_calibration.clear_calibrator() input_tensor = array_ops.constant( [1.0, 1.0, 3.0, 4.0, 6.0], dtypes.float32 ) @@ -225,7 +129,7 @@ def testHistogramCalibration(self): input_tensor, id='7', calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, - initial_num_bins=256, + num_bins=512, ) aggregator_output = self.evaluate(aggregator) self.assertAllEqual(aggregator_output.output, [1.0, 1.0, 3.0, 4.0, 6.0]) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.cc deleted file mode 100644 index 8f7c4e30457a2e..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <optional>
-
-#include "absl/strings/str_format.h"
-#include "absl/strings/string_view.h"
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "pybind11_protobuf/native_proto_caster.h"  // from @pybind11_protobuf
-#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h"
-
-namespace py = ::pybind11;
-
-namespace {
-
-using ::tensorflow::calibrator::CalibrationStatistics;
-using ::tensorflow::calibrator::CalibratorSingleton;
-
-// Retrieves collected statistics of a `CustomAggregator` node from the
-// singleton. `id` is the identifier of the `CustomAggregator`.
-CalibrationStatistics GetStatisticsFromCalibrator(const absl::string_view id) {
-  std::optional<CalibrationStatistics> statistics =
-      CalibratorSingleton::GetStatistics(id);
-
-  if (!statistics.has_value()) {
-    throw py::value_error(absl::StrFormat(
-        "Calibrated data does not exist. Cannot find statistics."
-        "value for id: '%s'",
-        id));
-  }
-
-  return *statistics;
-}
-
-}  // namespace
-
-PYBIND11_MODULE(pywrap_calibration, m) {
-  // Allows type casting protobuf objects.
-  pybind11_protobuf::ImportNativeProtoCasters();
-
-  m.doc() = "Defines functions for interacting with CalibratorSingleton.";
-
-  m.def(
-      // If the function signature changes, likely its corresponding .pyi type
-      // hinting should also change.
-      // LINT.IfChange
-      "clear_calibrator",
-      []() -> void
-      // LINT.ThenChange(pywrap_calibration.pyi:clear_calibrator)
-      { CalibratorSingleton::ClearCollectedInformation(); },
-      R"pbdoc(
-      Clears the collected metrics from the calibrator.
-    )pbdoc");
-  m.def(
-      // If the function signature changes, likely its corresponding .pyi type
-      // hinting should also change.
-      // LINT.IfChange
-      "clear_data_from_calibrator",
-      [](const absl::string_view id) -> void
-      // LINT.ThenChange(pywrap_calibration.pyi:clear_data_from_calibrator)
-      { CalibratorSingleton::ClearData(id); },
-      R"pbdoc(
-      Clears the collected data of the given id from calibrator.
-    )pbdoc",
-      py::arg("id"));
-  m.def(
-      // If the function signature changes, likely its corresponding .pyi type
-      // hinting should also change.
-      // LINT.IfChange
-      "get_statistics_from_calibrator",
-      [](const absl::string_view id) -> CalibrationStatistics {
-        // LINT.ThenChange(pywrap_calibration.pyi:get_statistics_from_calibrator)
-        return GetStatisticsFromCalibrator(id);
-      },
-      R"pbdoc(
-      Returns the proto CalibrationStatistics given id from calibrator.
-    )pbdoc",
-      py::arg("id"));
-}
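With pywrap_calibration deleted, calibration results no longer round-trip through CalibratorSingleton: the CustomAggregator kernel now emits min, max, and an optional fixed-width histogram as ordinary op outputs, which the Python tests above read straight off the evaluated tensors. A minimal sketch of that output contract, assuming a non-empty float input; the helper name Aggregate is hypothetical and only approximates what CustomAggregatorOp and CalculateHistogramStatistics compute:

  // Sketch: min/max plus a fixed-width histogram, mirroring the op's four
  // outputs (output, min, max, histogram). Not the kernel's actual code.
  #include <algorithm>
  #include <cstdint>
  #include <vector>

  struct AggregatorOutput {
    float min;
    float max;
    std::vector<int64_t> histogram;  // Empty unless a histogram method is set.
  };

  AggregatorOutput Aggregate(const std::vector<float>& values,
                             int32_t num_bins) {
    const auto [min_it, max_it] =
        std::minmax_element(values.begin(), values.end());
    AggregatorOutput out{*min_it, *max_it, {}};
    if (num_bins > 0) {
      out.histogram.assign(num_bins, 0);
      const float width = (out.max - out.min) / num_bins;
      for (const float v : values) {
        const int32_t bin =
            width > 0 ? static_cast<int32_t>((v - out.min) / width) : 0;
        ++out.histogram[std::min(bin, num_bins - 1)];  // Clamp into last bin.
      }
    }
    return out;
  }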
- )pbdoc", - py::arg("id")); -} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc index 565adebfe52300..64695d6719885d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc @@ -71,7 +71,7 @@ LogicalResult FoldOperation(OpBuilder& builder, Operation* op, bool IsOperationFoldable(Operation* op) { if (isa(op)) return true; - if (!op->getDialect()->getNamespace().equals("tf") || !TF::CanBeFolded(op)) { + if (op->getDialect()->getNamespace() != "tf" || !TF::CanBeFolded(op)) { return false; } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc index c2445456339fb9..60d2c07bdab8ea 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc @@ -93,8 +93,7 @@ SmallVector GetEntryFunctionInputs(func::FuncOp func_op) { func_op->getAttrOfType("tf.entry_function"); SmallVector inputs; - entry_function_attr.get("inputs") - .dyn_cast_or_null() + mlir::dyn_cast_or_null(entry_function_attr.get("inputs")) .strref() .split(inputs, /*Separator=*/","); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args_test.cc index 238b1bb8ef8955..d77859a67c9dca 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args_test.cc @@ -94,9 +94,11 @@ TEST_F(ConvertAssetArgsTest, ConvertsSingleAssetArg) { EXPECT_THAT(arg_attrs.get("tf_saved_model.bound_input"), IsNull()); const ArrayRef index_path_attrs = - arg_attrs.get("tf_saved_model.index_path").cast().getValue(); + mlir::cast(arg_attrs.get("tf_saved_model.index_path")) + .getValue(); EXPECT_THAT(index_path_attrs, SizeIs(1)); - StringAttr index_path = index_path_attrs[0].dyn_cast_or_null(); + StringAttr index_path = + mlir::dyn_cast_or_null(index_path_attrs[0]); EXPECT_THAT(index_path, NotNull()); EXPECT_THAT(index_path, Eq("arg_0:0")); } @@ -122,9 +124,11 @@ TEST_F(ConvertAssetArgsTest, NonBoundedArgsNotModified) { EXPECT_THAT(arg_attrs.get("tf_saved_model.bound_input"), IsNull()); const ArrayRef index_path_attrs = - arg_attrs.get("tf_saved_model.index_path").cast().getValue(); + mlir::cast(arg_attrs.get("tf_saved_model.index_path")) + .getValue(); EXPECT_THAT(index_path_attrs, SizeIs(1)); - StringAttr index_path = index_path_attrs[0].dyn_cast_or_null(); + StringAttr index_path = + mlir::dyn_cast_or_null(index_path_attrs[0]); EXPECT_THAT(index_path, NotNull()); EXPECT_THAT(index_path, Eq("arg_0:0")); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc index 7be369e7947ced..8ba632b66ae0f3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc @@ -65,9 +65,9 @@ bool QuantizationUnitLoc::classof(Attribute attr) { if (!llvm::isa(attr)) return false; auto callsite_loc = llvm::dyn_cast(attr); - if (!callsite_loc.getCaller().isa()) return false; + if (!mlir::isa(callsite_loc.getCaller())) return false; StringRef 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc
index 7be369e7947ced..8ba632b66ae0f3 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.cc
@@ -65,9 +65,9 @@ bool QuantizationUnitLoc::classof(Attribute attr) {
   if (!llvm::isa<CallSiteLoc>(attr)) return false;
   auto callsite_loc = llvm::dyn_cast<CallSiteLoc>(attr);
-  if (!callsite_loc.getCaller().isa<NameLoc>()) return false;
+  if (!mlir::isa<NameLoc>(callsite_loc.getCaller())) return false;
   StringRef caller_name =
-      callsite_loc.getCaller().cast<NameLoc>().getName().strref();
+      mlir::cast<NameLoc>(callsite_loc.getCaller()).getName().strref();
   return caller_name.starts_with(kQuantizationUnitPrefix) &&
          caller_name.ends_with(kQuantizationUnitSuffix);
 }
@@ -75,8 +75,8 @@ std::optional<QuantizationUnitLoc::QuantizationUnit>
 FindQuantizationUnitFromLoc(Location loc) {
   if (isa<QuantizationUnitLoc>(loc)) {
-    Location caller = loc.cast<CallSiteLoc>().getCaller();
-    StringRef caller_name = caller.cast<NameLoc>().getName().strref();
+    Location caller = mlir::cast<CallSiteLoc>(loc).getCaller();
+    StringRef caller_name = mlir::cast<NameLoc>(caller).getName().strref();
     const size_t start_index = kQuantizationUnitPrefix.size();
     const size_t end_index = caller_name.rfind(kQuantizationUnitSuffix);
     std::string serialized_proto =
@@ -87,14 +87,15 @@ FindQuantizationUnitFromLoc(Location loc) {
     }
   } else if (isa<FusedLoc>(loc)) {
     // If the op is rewritten, FusedLoc can be created.
-    for (Location child_loc : loc.cast<FusedLoc>().getLocations()) {
+    for (Location child_loc : mlir::cast<FusedLoc>(loc).getLocations()) {
       std::optional<QuantizationUnitLoc::QuantizationUnit> found_unit =
           FindQuantizationUnitFromLoc(child_loc);
       if (found_unit.has_value()) return found_unit;
     }
   } else if (isa<CallSiteLoc>(loc)) {
     // If the graph is inlined, CallSiteLoc can be created.
-    return FindQuantizationUnitFromLoc(loc.cast<CallSiteLoc>().getCallee());
+    return FindQuantizationUnitFromLoc(
+        mlir::cast<CallSiteLoc>(loc).getCallee());
   }
   return std::nullopt;
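For orientation: a QuantizationUnitLoc carries a serialized QuantizationUnit proto inside a NameLoc name, bracketed by kQuantizationUnitPrefix and kQuantizationUnitSuffix, which is exactly what the substr/rfind arithmetic above recovers. A standalone sketch of that parse; the prefix and suffix literals here are assumptions for illustration, not the real constants:

  #include <optional>
  #include <string>

  constexpr char kPrefix[] = "quant_unit[";  // Assumed shape, not the TF value.
  constexpr char kSuffix[] = "]";

  std::optional<std::string> ExtractSerializedUnit(const std::string& name) {
    constexpr size_t kPrefixLen = sizeof(kPrefix) - 1;
    if (name.rfind(kPrefix, 0) != 0) return std::nullopt;  // starts_with check
    const size_t end = name.rfind(kSuffix);
    if (end == std::string::npos || end < kPrefixLen) return std::nullopt;
    return name.substr(kPrefixLen, end - kPrefixLen);  // serialized proto bytes
  }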
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc
index 52ca3722a12bd5..9630b20b32d571 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc
@@ -56,7 +56,7 @@ bool IsOpWithInt8TypeOperand(Operation* op) {
 }
 bool IsValueWithQuantizablePrecision(Value val) {
-  auto type = val.getType().dyn_cast<ShapedType>();
+  auto type = mlir::dyn_cast<ShapedType>(val.getType());
   if (!type) return false;
   // Supported original tensor data types.
   if (type.getElementType().isF32() || type.getElementType().isBF16())
@@ -82,7 +82,7 @@ std::unique_ptr<OpQuantSpec> GetTFOpQuantSpec(Operation* op) {
   auto spec = std::make_unique<OpQuantSpec>();
   if (auto call_op = dyn_cast<TF::PartitionedCallOp>(op)) {
     StringRef function_name =
-        call_op.getFAttr().cast<FlatSymbolRefAttr>().getValue();
+        mlir::cast<FlatSymbolRefAttr>(call_op.getFAttr()).getValue();
     if (!function_name.starts_with("composite_")) {
       return spec;
     }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc
index 723adde447e546..47beb9e0c2636f 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.cc
@@ -153,11 +153,10 @@ QuantizedType CalculateUniformQuantParams(
   DenseFPElementsAttr attr;
   if (!matchPattern(op->getResult(0), m_Constant(&attr))) return nullptr;
-  QuantizedType quant_type =
+  QuantizedType quant_type = mlir::dyn_cast<QuantizedType>(
       quant::GetUniformQuantizedTypeForWeight(
           attr, /*symmetric=*/kIsNarrowRange && kIsSigned, kBitWidth, kIsSigned,
-          kIsNarrowRange, /*is_legacy_float*/ false)
-          .template dyn_cast<QuantizedType>();
+          kIsNarrowRange, /*is_legacy_float*/ false));
   return quant_type;
 }
@@ -172,16 +171,16 @@ std::optional<Value> AddUniformQuantizeOps(PatternRewriter& rewriter,
   }
   Type expressed_type = op.getResult().getType();
   Type quantized_type = quant_type.castFromExpressedType(expressed_type);
-  ShapedType shaped_quantized_type = quantized_type.cast<ShapedType>();
+  ShapedType shaped_quantized_type = mlir::cast<ShapedType>(quantized_type);
   DenseElementsAttr tensor_proto_attr =
-      Quantize(attr, shaped_quantized_type).dyn_cast<DenseElementsAttr>();
+      mlir::dyn_cast<DenseElementsAttr>(Quantize(attr, shaped_quantized_type));
   if (!tensor_proto_attr) {
     return nullptr;
   }
-  Type storage_type = shaped_quantized_type.getElementType()
-                          .cast<QuantizedType>()
-                          .getStorageType();
+  Type storage_type =
+      mlir::cast<QuantizedType>(shaped_quantized_type.getElementType())
+          .getStorageType();
   ShapedType new_type = shaped_quantized_type.clone(storage_type);
   rewriter.setInsertionPointAfter(op);
@@ -205,7 +204,7 @@ Operation* LogicsForUniformDequanization(PatternRewriter& rewriter,
   auto new_cast_op =
       rewriter.create<TF::CastOp>(loc, create_unknown_input_shape, input_val);
   // TODO - b/278949920: Enable Per-Channel Quantization for XLA Opset
-  auto qtype = quant_type.dyn_cast<UniformQuantizedType>();
+  auto qtype = mlir::dyn_cast<UniformQuantizedType>(quant_type);
   TensorType scale_type = RankedTensorType::get({}, rewriter.getF32Type());
   Value scale_op = rewriter.create<TF::ConstOp>(
       loc, scale_type,
@@ -253,7 +252,7 @@ std::optional<Value> ApplyUniformQuantization(
   std::optional<Value> dequantized_val =
       AddUniformDequantizeOps(rewriter, quant_type, quantized_val.value(),
-                              op.getType().cast<ShapedType>());
+                              mlir::cast<ShapedType>(op.getType()));
   return dequantized_val;
 }
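CalculateUniformQuantParams above requests a symmetric, signed, narrow-range 8-bit type from quant::GetUniformQuantizedTypeForWeight (kBitWidth, kIsSigned, and kIsNarrowRange are constants defined earlier in that file). The scale/zero-point arithmetic this reduces to is the standard formulation; a sketch for orientation only, not the library's implementation:

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  struct QuantParams {
    double scale;
    int64_t zero_point;  // Always 0 for symmetric quantization.
  };

  // Narrow range maps weights onto [-127, 127] so that 0.0f hits the zero
  // point exactly, which keeps symmetric int8 kernels bias-free.
  QuantParams SymmetricInt8Params(float min_value, float max_value) {
    const double max_abs =
        std::max(std::fabs(min_value), std::fabs(max_value));
    return {max_abs / 127.0, 0};
  }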
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc
index d390ac6d548e78..109fa943f9334b 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_quantization_unit_loc.cc
@@ -69,17 +69,17 @@ class AddQuantizationUnitLocPass
 // tensorflow/compiler/mlir/tensorflow/translate/import_model.cc for more
 // details.
 bool IsImportLocPattern(FusedLoc loc) {
-  ArrayRef<Location> locations = loc.cast<FusedLoc>().getLocations();
+  ArrayRef<Location> locations = mlir::cast<FusedLoc>(loc).getLocations();
   if (locations.size() < 2 || !isa<NameLoc>(locations.front())) return false;
   StringRef op_type_with_suffix =
-      locations.front().cast<NameLoc>().getName().strref();
+      mlir::cast<NameLoc>(locations.front()).getName().strref();
   if (!op_type_with_suffix.ends_with(":")) return false;
   return absl::c_all_of(locations, [](Location loc) {
     return isa<NameLoc>(loc) ||
            (isa<CallSiteLoc>(loc) &&
-            isa<NameLoc>(loc.cast<CallSiteLoc>().getCallee()));
+            isa<NameLoc>(mlir::cast<CallSiteLoc>(loc).getCallee()));
   });
 }
@@ -99,23 +99,23 @@ void FindQuantizationUnitsRecursively(Location loc,
     }
   };
-  ArrayRef<Location> locations = loc.cast<FusedLoc>().getLocations();
-  if (IsImportLocPattern(loc.cast<FusedLoc>())) {
+  ArrayRef<Location> locations = mlir::cast<FusedLoc>(loc).getLocations();
+  if (IsImportLocPattern(mlir::cast<FusedLoc>(loc))) {
     QuantizationUnit new_unit;
     // Op type is a NameLoc with the ":" suffix.
     StringRef op_type_with_suffix =
-        locations.front().cast<NameLoc>().getName().strref();
+        mlir::cast<NameLoc>(locations.front()).getName().strref();
     StringRef op_type =
         op_type_with_suffix.substr(0, op_type_with_suffix.size() - 1);
     new_unit.set_op_type(op_type.str());
     if (isa<NameLoc>(locations.back())) {
       StringRef name_loc_id =
-          locations.back().cast<NameLoc>().getName().strref();
+          mlir::cast<NameLoc>(locations.back()).getName().strref();
       set_node_and_func_name(new_unit, name_loc_id);
     } else {
-      Location callee = locations.back().cast<CallSiteLoc>().getCallee();
-      StringRef name_loc_id = callee.cast<NameLoc>().getName().strref();
+      Location callee = mlir::cast<CallSiteLoc>(locations.back()).getCallee();
+      StringRef name_loc_id = mlir::cast<NameLoc>(callee).getName().strref();
       set_node_and_func_name(new_unit, name_loc_id);
     }
     units.push_back(new_unit);
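IsImportLocPattern and FindQuantizationUnitsRecursively match the location shape the GraphDef importer produces: a FusedLoc whose first element is a NameLoc holding the op type plus a trailing ':', followed by the node name either directly or behind a CallSiteLoc once the graph has been inlined. Roughly, in textual form (approximate and printer-dependent), together with the trailing-colon strip:

  // fused["Conv2D:", "model/conv2d/Conv2D"]
  // fused["Conv2D:", callsite("model/conv2d/Conv2D" at ...)]
  #include "llvm/ADT/StringRef.h"

  // Mirrors the substr(0, size() - 1) above: "Conv2D:" -> "Conv2D".
  llvm::StringRef StripOpTypeSuffix(llvm::StringRef op_type_with_suffix) {
    return op_type_with_suffix.drop_back(1);
  }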
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc
index e4229cb97bf45a..8c02ace87d8001 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
@@ -75,8 +76,8 @@ class ConvertCustomAggregationOpToQuantStats
   LogicalResult matchAndRewrite(TF::CustomAggregatorOp op,
                                 PatternRewriter &rewriter) const override {
-    FloatAttr min = op->getAttr("min").dyn_cast_or_null<FloatAttr>();
-    FloatAttr max = op->getAttr("max").dyn_cast_or_null<FloatAttr>();
+    FloatAttr min = mlir::dyn_cast_or_null<FloatAttr>(op->getAttr("min"));
+    FloatAttr max = mlir::dyn_cast_or_null<FloatAttr>(op->getAttr("max"));
     // When there are no min and max attributes, remove op.
     if (min == nullptr || max == nullptr) {
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc
index d23a0f8d3a7af2..c39492f0efe709 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_tf_xla_op_to_tf_op.cc
@@ -158,10 +158,8 @@ Value CreateEinsumOpFromXlaDotV2Op(OpBuilder& builder, const Location loc,
   xla::DotDimensionNumbers dot_dimension_numbers;
   dot_dimension_numbers.ParseFromString(dot_dimension_numbers_str.str());
   SmallVector<Value> input_arguments = {lhs, rhs};
-  const int lhs_rank =
-      lhs.getType().template cast<ShapedType>().getShape().size();
-  const int rhs_rank =
-      rhs.getType().template cast<ShapedType>().getShape().size();
+  const int lhs_rank = mlir::cast<ShapedType>(lhs.getType()).getShape().size();
+  const int rhs_rank = mlir::cast<ShapedType>(rhs.getType()).getShape().size();
   const std::string einsum_equation =
       CreateEinsumEquation(dot_dimension_numbers, lhs_rank, rhs_rank);
@@ -218,7 +216,7 @@ RankedTensorType RestoreCollapsedDimensions(
 Type GetSliceOpOutputType(Type xla_gather_op_output_type,
                           const absl::flat_hash_set<int64_t>& collapsed_dims) {
   if (auto ranked_output_type =
-          xla_gather_op_output_type.dyn_cast<RankedTensorType>();
+          mlir::dyn_cast<RankedTensorType>(xla_gather_op_output_type);
       ranked_output_type) {
     return RestoreCollapsedDimensions(ranked_output_type, collapsed_dims);
   }
@@ -228,9 +226,9 @@ Type GetSliceOpOutputType(Type xla_gather_op_output_type,
 // TODO (b/275225582): Supports Xla Gather op in general case.
 bool IsXlaGatherWithoutBatch(Value operand, Value start_indices) {
-  auto operand_type = operand.getType().dyn_cast_or_null<ShapedType>();
+  auto operand_type = mlir::dyn_cast_or_null<ShapedType>(operand.getType());
   auto start_indices_type =
-      start_indices.getType().dyn_cast_or_null<ShapedType>();
+      mlir::dyn_cast_or_null<ShapedType>(start_indices.getType());
   if (start_indices_type == nullptr || operand_type == nullptr) return false;
   return start_indices_type.getShape().size() == 1;
 }
@@ -245,7 +243,7 @@ Value CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch(
   // Construct full start_indices with given start_indices and
   // start_index_map.
   const ArrayRef<int64_t> operand_shape =
-      operand.getType().cast<ShapedType>().getShape();
+      mlir::cast<ShapedType>(operand.getType()).getShape();
   const int64_t operand_rank = operand_shape.size();
   // Fills zeros if start_index is not given in start_indices.
@@ -273,7 +271,7 @@ Value CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch(
       builder.create<TF::CastOp>(
           loc,
           RankedTensorType::get(
-              start_indices.getType().template cast<ShapedType>().getShape(),
+              mlir::cast<ShapedType>(start_indices.getType()).getShape(),
              builder.getI64Type()),
          start_indices));
@@ -289,7 +287,7 @@ Value CreateSliceAndReshapeOpFromXlaGatherOpWithoutBatch(
       builder.create<TF::CastOp>(
           loc,
           RankedTensorType::get(
-              slice_sizes.getType().template cast<ShapedType>().getShape(),
+              mlir::cast<ShapedType>(slice_sizes.getType()).getShape(),
              builder.getI64Type()),
          slice_sizes));
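CreateEinsumOpFromXlaDotV2Op rewrites XlaDotV2 as a TF Einsum whose equation is derived from the operand ranks and the DotDimensionNumbers proto. A simplified builder for the no-batch, single-contracting-dimension case; the name and signature are hypothetical, and the real CreateEinsumEquation also threads batch dimensions through:

  #include <string>

  // Label every dimension, reuse one label across the contracting pair, then
  // emit "lhs,rhs->out". E.g. SimpleDotEquation(2, 2, 1, 0) == "ab,bc->ac".
  std::string SimpleDotEquation(int lhs_rank, int rhs_rank,
                                int lhs_contracting, int rhs_contracting) {
    std::string lhs, rhs, out;
    char next = 'a';
    for (int i = 0; i < lhs_rank; ++i) lhs += next++;
    for (int i = 0; i < rhs_rank; ++i)
      rhs += (i == rhs_contracting) ? lhs[lhs_contracting] : next++;
    for (int i = 0; i < lhs_rank; ++i)
      if (i != lhs_contracting) out += lhs[i];
    for (int i = 0; i < rhs_rank; ++i)
      if (i != rhs_contracting) out += rhs[i];
    return lhs + "," + rhs + "->" + out;
  }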
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project @@ -92,7 +93,7 @@ class ReplaceTpuPartitionedCallOpWithPartitionedCallOp private: LogicalResult matchAndRewrite(TF::TPUPartitionedCallOp call_op, PatternRewriter& rewriter) const override { - auto f_attr = call_op.getFAttr().dyn_cast(); + auto f_attr = mlir::dyn_cast(call_op.getFAttr()); auto module_op = call_op->getParentOfType(); SymbolTable symbol_table(module_op); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc index 5ed89d89339571..c5d7ca8e47f6f9 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc @@ -14,10 +14,14 @@ limitations under the License. ==============================================================================*/ #include #include +#include +#include #include #include "absl/status/statusor.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/CommandLine.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -28,6 +32,7 @@ limitations under the License. #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -53,6 +58,40 @@ using ::stablehlo::quantization::Method; constexpr StringRef kQuantTraitAttrName = "_tfl_quant_trait"; +// Whether the op is a call op to lifted composite function. +bool IsCallToQuantizableLiftedFunction(Operation *op) { + if (!op) return false; + if (auto xla_call_module_op = dyn_cast_or_null(op); + xla_call_module_op != nullptr) { + absl::StatusOr method = GetQuantizationMethod(xla_call_module_op); + if (method.ok() && method->has_static_range_ptq()) return true; + } + + TF::PartitionedCallOp call_op = dyn_cast_or_null(op); + return call_op && call_op->hasAttrOfType(kQuantTraitAttrName) && + call_op->getAttrOfType(kQuantTraitAttrName).getValue() == + llvm::StringRef( + QuantTraitValues[QuantizationTrait::FullyQuantizable]); +} + +// Returns the composite function name. 
+// Returns the composite function name.
+std::optional<StringRef> GetCompositeFunctionName(Operation *op) {
+  if (!IsCallToQuantizableLiftedFunction(op)) return std::nullopt;
+
+  if (auto xla_call_module_op = dyn_cast_or_null<TF::XlaCallModuleOp>(op);
+      xla_call_module_op != nullptr) {
+    auto entry_function_attr = xla_call_module_op->getAttrOfType<StringAttr>(
+        kOriginalStablehloEntryFunctionAttrName);
+    if (!entry_function_attr) return std::nullopt;
+    return entry_function_attr.getValue();
+  } else {
+    TF::PartitionedCallOp call_op = dyn_cast_or_null<TF::PartitionedCallOp>(op);
+    const auto f_attr = call_op.getFAttr().dyn_cast<FlatSymbolRefAttr>();
+    if (!f_attr) return std::nullopt;
+    return f_attr.getValue();
+  }
+}
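// Editorial note (not part of the patch): the name returned above seeds the
// deterministic aggregator ids minted in AddCustomAggregationOp below, which
// is what lets this change delete the counter-based
// IssueIDsOfCustomAggregationOpsPass further down. The id format, assembled
// with llvm::Twine, is
//
//   <composite function name>_arg_<operand index>_calibration_method_<enum>
//
// (the "_arg_<n>" part is dropped for the single-result output case), e.g. a
// hypothetical "composite_conv2d_fn_1_arg_0_calibration_method_2". An
// equivalent helper, for illustration only:
//
//   std::string MakeAggregatorId(llvm::StringRef func_name, int arg_idx,
//                                int method) {
//     return (llvm::Twine(func_name) + "_arg_" + llvm::Twine(arg_idx) +
//             "_calibration_method_" + llvm::Twine(method))
//         .str();
//   }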
+
 class InsertCustomAggregationOpsPass
     : public PassWrapper<InsertCustomAggregationOpsPass,
                          OperationPass<func::FuncOp>> {
@@ -145,7 +184,7 @@ class InsertCustomAggregationOpsPass
             CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE);
         auto calibration_parameters =
             CalibrationOptions::CalibrationParameters();
-        calibration_parameters.set_initial_num_bins(256);
+        calibration_parameters.set_num_bins(512);
         calibration_parameters.set_min_percentile(0.001);
         calibration_parameters.set_max_percentile(99.999);
         calib_opts_.mutable_calibration_parameters()->CopyFrom(
             calibration_parameters);
         break;
@@ -157,7 +196,7 @@ class InsertCustomAggregationOpsPass
             CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE);
         auto calibration_parameters =
             CalibrationOptions::CalibrationParameters();
-        calibration_parameters.set_initial_num_bins(256);
+        calibration_parameters.set_num_bins(512);
         calib_opts_.mutable_calibration_parameters()->CopyFrom(
             calibration_parameters);
         break;
@@ -167,7 +206,7 @@ class InsertCustomAggregationOpsPass
             CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY);
         auto calibration_parameters =
             CalibrationOptions::CalibrationParameters();
-        calibration_parameters.set_initial_num_bins(256);
+        calibration_parameters.set_num_bins(512);
         calib_opts_.mutable_calibration_parameters()->CopyFrom(
             calibration_parameters);
         break;
@@ -177,7 +216,7 @@ class InsertCustomAggregationOpsPass
             CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC);
         auto calibration_parameters =
             CalibrationOptions::CalibrationParameters();
-        calibration_parameters.set_initial_num_bins(256);
+        calibration_parameters.set_num_bins(512);
         calib_opts_.mutable_calibration_parameters()->CopyFrom(
             calibration_parameters);
         break;
@@ -204,17 +243,22 @@ class AddCustomAggregationOp : public RewritePattern {
     // The CustomAggregatorOp is only added after quantizable values.
     SmallVector<Value> quantizable_values;
+    SmallVector<std::string> aggregator_ids;
     if (IsCallToQuantizableLiftedFunction(op)) {
+      std::optional<StringRef> composite_function_name =
+          GetCompositeFunctionName(op);
+      if (!composite_function_name.has_value()) return failure();
+
       // Quantize inputs of quantizable composite functions.
-      for (Value input : op->getOperands()) {
-        Type element_type = getElementTypeOrSelf(input.getType());
+      for (OpOperand &input : op->getOpOperands()) {
+        Type element_type = getElementTypeOrSelf(input.get().getType());
         // Non-float cases won't be calibrated.
         if (!element_type.isF32()) {
           continue;
         }
         // Skip when there is any already existing CustomAggregatorOp found.
-        Operation *defining_op = input.getDefiningOp();
+        Operation *defining_op = input.get().getDefiningOp();
         if (dyn_cast_or_null<TF::CustomAggregatorOp>(defining_op)) {
           continue;
         }
@@ -225,41 +269,51 @@ class AddCustomAggregationOp : public RewritePattern {
           continue;
         }
-        quantizable_values.push_back(input);
+        quantizable_values.push_back(input.get());
+        aggregator_ids.push_back(
+            (llvm::Twine(composite_function_name.value()) + "_arg_" +
+             llvm::Twine(input.getOperandNumber()) + "_calibration_method_" +
+             llvm::Twine(calib_opts_.calibration_method()))
+                .str());
       }
     } else {
       // Quantize output of fully quantizable composite functions.
       for (Value input : op->getOperands()) {
         auto defining_op = input.getDefiningOp();
-        if (!IsCallToQuantizableLiftedFunction(defining_op)) {
-          continue;
-        }
+        std::optional<StringRef> composite_function_name =
+            GetCompositeFunctionName(defining_op);
+        if (!composite_function_name.has_value()) continue;
         // Do not add CustomAggregatorOp after Gather since it is a weight-only
         // quantizable op.
         if (auto call_op =
                 dyn_cast_or_null<TF::PartitionedCallOp>(defining_op)) {
           StringRef function_name =
-              call_op.getFAttr().cast<FlatSymbolRefAttr>().getValue();
+              mlir::cast<FlatSymbolRefAttr>(call_op.getFAttr()).getValue();
           if (function_name.contains("gather")) continue;
         }
         quantizable_values.push_back(input);
+        // All composite functions have a single result at the moment.
+        aggregator_ids.push_back((llvm::Twine(composite_function_name.value()) +
+                                  "_calibration_method_" +
+                                  llvm::Twine(calib_opts_.calibration_method()))
+                                     .str());
      }
    }
    if (quantizable_values.empty()) return failure();
-    for (Value value : quantizable_values) {
+    int32_t effective_num_bins = GetNumBins(calib_opts_);
+    for (auto [value, aggregator_id] :
+         llvm::zip_equal(quantizable_values, aggregator_ids)) {
       // ID attribute will have empty value for now.
       SmallVector<NamedAttribute> attributes{
-          rewriter.getNamedAttr("id", rewriter.getStringAttr("")),
+          rewriter.getNamedAttr("id", rewriter.getStringAttr(aggregator_id)),
          rewriter.getNamedAttr(
              "calibration_method",
              rewriter.getI32IntegerAttr(calib_opts_.calibration_method())),
-          rewriter.getNamedAttr(
-              "initial_num_bins",
-              rewriter.getI32IntegerAttr(
-                  calib_opts_.calibration_parameters().initial_num_bins())),
+          rewriter.getNamedAttr("num_bins",
+                                rewriter.getI32IntegerAttr(effective_num_bins)),
          rewriter.getNamedAttr(
              "min_percentile",
              rewriter.getF32FloatAttr(
                  calib_opts_.calibration_parameters().min_percentile())),
          rewriter.getNamedAttr(
              "max_percentile",
              rewriter.getF32FloatAttr(
                  calib_opts_.calibration_parameters().max_percentile())),
      };
-      int32_t num_bins = GetNumBins(calib_opts_.calibration_method());
       SmallVector<Type, 4> output_types{
           value.getType(),
           RankedTensorType::get({}, rewriter.getF32Type()),
           RankedTensorType::get({}, rewriter.getF32Type()),
-          RankedTensorType::get({num_bins}, rewriter.getI64Type()),
+          RankedTensorType::get({effective_num_bins}, rewriter.getI64Type()),
       };
       // Insert custom aggregation op between operand and operator.
@@ -292,22 +345,6 @@ class AddCustomAggregationOp : public RewritePattern {
  private:
   CalibrationOptions calib_opts_;
-
-  // Whether the op is a call op to lifted composite function.
- bool IsCallToQuantizableLiftedFunction(Operation *op) const { - if (!op) return false; - if (auto xla_call_module_op = dyn_cast_or_null(op); - xla_call_module_op != nullptr) { - absl::StatusOr method = GetQuantizationMethod(xla_call_module_op); - if (method.ok() && method->has_static_range_ptq()) return true; - } - - TF::PartitionedCallOp call_op = dyn_cast_or_null(op); - return call_op && call_op->hasAttrOfType(kQuantTraitAttrName) && - call_op->getAttrOfType(kQuantTraitAttrName) - .getValue() - .equals(QuantTraitValues[QuantizationTrait::FullyQuantizable]); - } }; void InsertCustomAggregationOpsPass::runOnOperation() { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc index 682889917c112e..0f855088d17943 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc @@ -154,7 +154,7 @@ void GetUniqueInputOutputNodeNames(ModuleOp module_op, if (auto inputs_attr = tf_attrs.get("inputs")) { const std::string inputs_attr_str = - inputs_attr.cast().getValue().str(); + mlir::cast(inputs_attr).getValue().str(); std::vector fn_input_names = absl::StrSplit(inputs_attr_str, ',', absl::SkipEmpty()); @@ -174,7 +174,7 @@ void GetUniqueInputOutputNodeNames(ModuleOp module_op, if (auto outputs_attr = tf_attrs.get("outputs")) { const std::string outputs_attr_str = - outputs_attr.cast().getValue().str(); + mlir::cast(outputs_attr).getValue().str(); std::vector fn_output_names = absl::StrSplit(outputs_attr_str, ',', absl::SkipEmpty()); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/issue_ids_of_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/issue_ids_of_custom_aggregation_ops.cc deleted file mode 100644 index 1100a903f6e226..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/issue_ids_of_custom_aggregation_ops.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#include -#include -#include -#include -#include - -#include "llvm/ADT/StringRef.h" -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" - -namespace mlir { -namespace quant { -namespace { - -class IssueIDsOfCustomAggregationOpsPass - : public PassWrapper> { - public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( - IssueIDsOfCustomAggregationOpsPass) - - StringRef getArgument() const final { - // This is the argument used to refer to the pass in the textual format (on - // the commandline for example). - return "quant-issues-ids-of-custom-aggregation-ops"; - } - - StringRef getDescription() const final { - // This is a brief description of the pass. - return "Issue IDs of custom aggregation ops for the calibration procedure"; - } - - void getDependentDialects(DialectRegistry& registry) const override { - registry.insert(); - } - - private: - void runOnOperation() override; - - void issueIdToCustomAggregator(Operation* op); - - // Count of aggregator ops encountered; - int aggregator_count_; -}; - -static PassRegistration pass; - -void IssueIDsOfCustomAggregationOpsPass::issueIdToCustomAggregator( - Operation* op) { - // Return early when only aggregator operators are given. - if (!dyn_cast_or_null(op)) return; - - // Issue id based on the number of aggregators found. - OpBuilder builder(op); - op->setAttr("id", builder.getStringAttr(std::to_string(aggregator_count_))); - ++aggregator_count_; -} - -void IssueIDsOfCustomAggregationOpsPass::runOnOperation() { - ModuleOp module = getOperation(); - module.walk([&](Operation* op) { issueIdToCustomAggregator(op); }); -} - -} // namespace - -std::unique_ptr> -CreateIssueIDsOfCustomAggregationOpsPass() { - return std::make_unique(); -} - -} // namespace quant -} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_hashtable_ops_as_args.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_hashtable_ops_as_args.cc index f48d15dd81cbdb..18ee96bfe9422e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_hashtable_ops_as_args.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_hashtable_ops_as_args.cc @@ -70,14 +70,15 @@ bool IsHashTableOp(Operation* op) { // Checks if the function is the main or initializer function. 
 bool IsMainOrInitializerFunction(ModuleOp module, func::FuncOp func) {
-  if (func.getSymName().equals(tensorflow::kImportModelDefaultGraphFuncName) ||
-      func.getSymName().equals(kTfQuantSaveFuncName)) {
+  if (func.getSymName() ==
+          llvm::StringRef(tensorflow::kImportModelDefaultGraphFuncName) ||
+      func.getSymName() == kTfQuantSaveFuncName) {
     return true;
   }
   for (func::FuncOp init_func :
        tf_saved_model::GetInitializerFunctions(module)) {
-    if (func.getSymName().equals(init_func.getSymName())) {
+    if (func.getSymName() == init_func.getSymName()) {
       return true;
     }
   }
@@ -118,7 +119,7 @@ bool IsResourceInitialized(ModuleOp module_op, Operation* hash_table) {
        tf_saved_model::GetInitializerFunctions(module_op)) {
     for (Operation& op : init_func_op.getBody().getOps()) {
       StringRef other_shared_name = GetSharedName(&op);
-      if (IsHashTableOp(&op) && other_shared_name.equals(shared_name)) {
+      if (IsHashTableOp(&op) && other_shared_name == shared_name) {
         return true;
       }
     }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
index 63fb3bd94005ee..672cd78b01de9c 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
@@ -174,7 +174,7 @@ class CheckQuantizableOps
   LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op,
                                 PatternRewriter& rewriter) const override {
     StringRef function_name =
-        call_op.getFAttr().cast<FlatSymbolRefAttr>().getValue();
+        mlir::cast<FlatSymbolRefAttr>(call_op.getFAttr()).getValue();
     if (!function_name.starts_with("composite_") ||
         !call_op->hasAttr(kQuantTraitAttrName)) {
       return failure();
@@ -193,11 +193,10 @@
     }
     // Only the composite functions with f32 inputs are quantizable.
-    if (call_op.getResults().size() == 1 && !call_op->getResult(0)
-                                                 .getType()
-                                                 .cast<ShapedType>()
-                                                 .getElementType()
-                                                 .isF32()) {
+    if (call_op.getResults().size() == 1 &&
+        !mlir::cast<ShapedType>(call_op->getResult(0).getType())
+             .getElementType()
+             .isF32()) {
       check_status.Update(absl::InternalError(
           "Composite functions for quantization should be f32 type."));
     }
@@ -274,7 +273,7 @@ class CheckQuantizableOps
     // For BatchMatMul, the input must be ranked to determine the batch
     // dimensions.
     ShapedType shaped_type =
-        call_op->getOperand(0).getType().dyn_cast<ShapedType>();
+        mlir::dyn_cast<ShapedType>(call_op->getOperand(0).getType());
     if (!shaped_type || !shaped_type.hasRank()) {
       return absl::InternalError("The input of BatchMatMul must have rank.");
     }
@@ -282,7 +281,8 @@ class CheckQuantizableOps
     // This op is guaranteed to be a constant as ODS checks IsConstTensor.
     // Check if the number of elements meets the requirement.
     int64_t num_elements =
-        call_op.getOperand(0).getType().cast<ShapedType>().getNumElements();
+        mlir::cast<ShapedType>(call_op.getOperand(0).getType())
+            .getNumElements();
     if (num_elements < quant_options_.min_num_elements_for_weights()) {
       return absl::InternalError(
           "The params of Gather have fewer number of elements than "
@@ -391,7 +391,9 @@ void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() {
   populateWithGenerated(patterns);
   patterns.add<CheckQuantizableOps>(ctx, quant_options_);
   FrozenRewritePatternSet frozen_patterns(std::move(patterns));
-  for (auto func : module.getOps<func::FuncOp>()) {
+
+  // Iterate over the sorted list of functions to keep the order deterministic.
+ for (func::FuncOp func : GetSortedFunctions(module)) { if (failed(applyPatternsAndFoldGreedily(func, frozen_patterns))) { func.emitError() << "quant-lift-quantizable-spots-as-functions failed."; signalPassFailure(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc index 0acb2e56ea617e..a75bef5f842746 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc @@ -137,7 +137,8 @@ class CheckQuantizableOps // This op is guaranteed to be a constant as ODS checks IsConstTensor. // Check if the number of elements meets the requirement. int current_num_elements = - call_op.getOperand(idx).getType().cast().getNumElements(); + mlir::cast(call_op.getOperand(idx).getType()) + .getNumElements(); if (current_num_elements < min_num_elements_for_weights_) { call_op.emitRemark("Quantization is skipped for ") << call_op->getName().getStringRef().str() << " because it has " @@ -149,7 +150,7 @@ class CheckQuantizableOps } StringRef function_name = - call_op.getFAttr().cast().getValue(); + mlir::cast(call_op.getFAttr()).getValue(); if ((quantization_method_ == tensorflow::quantization::QuantizationMethod:: METHOD_DYNAMIC_RANGE_INT8) && (function_name.contains("batch_matmul") || diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc index f1f65a1a183371..fe196b9caa4452 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc @@ -34,6 +34,7 @@ limitations under the License. #include "mlir/IR/TypeRange.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" @@ -56,7 +57,6 @@ using ::mlir::tf_saved_model::kTfSavedModelInitializerInitType; using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType; using ::mlir::tf_saved_model::kTfSavedModelInitializerTypeAttr; using ::mlir::tf_saved_model::SessionInitializerOp; -using ::tensorflow::kImportModelDefaultGraphFuncName; // Array of initializer functions' types. The corresponding initializer // functions should be merged in this order. This is because: @@ -153,7 +153,7 @@ LogicalResult ValidateInitFunc(func::FuncOp init_func_op) { FetchOp fetch_op = graph_op.GetFetch(); for (const Value fetch : fetch_op.getFetches()) { - if (!fetch.getType().isa()) { + if (!mlir::isa(fetch.getType())) { fetch_op.emitError(absl::StrFormat( "Validation failed for the initializer function: %s. 
" "All initializer function's fetches should be " diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc index e092352dc52c29..6f42c9fcaba7c5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc @@ -143,7 +143,7 @@ BlockArgument GetFilePrefixArg(func::FuncOp main_func_op) { auto index_path_attr = main_func_op.getArgAttrOfType(i, kTfSavedModelIndexPathAttr); if (index_path_attr && !index_path_attr.empty() && - index_path_attr[0].cast() == kTfFilePrefix) { + mlir::cast(index_path_attr[0]) == kTfFilePrefix) { return main_func_op.getArgument(i); } } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h index 5ea5a058cc94d3..9a0084ef38f412 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h @@ -63,10 +63,6 @@ CreateLiftQuantizableSpotsAsFunctionsDRQPass( std::unique_ptr> CreateConvertCustomAggregationOpToQuantStatsPass(); -// Issues IDs of custom aggregation ops for preparing the calibration procedure. -std::unique_ptr> -CreateIssueIDsOfCustomAggregationOpsPass(); - // Inserts quantized function library. std::unique_ptr> CreateInsertQuantizedFunctionsPass( tensorflow::quantization::QuantizationMethod::PresetMethod diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc index 38075bb67b7010..a87245345f6987 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc @@ -98,8 +98,8 @@ class PrepareLiftingPass // indices in `val2`. bool HasEqualElementSize(Value val1, Value val2, ArrayRef val1_indices, ArrayRef val2_indices) { - ShapedType val1_shape = val1.getType().cast(); - ShapedType val2_shape = val2.getType().cast(); + ShapedType val1_shape = mlir::cast(val1.getType()); + ShapedType val2_shape = mlir::cast(val2.getType()); if (!val1_shape.hasRank() || !val2_shape.hasRank()) return false; int val1_result = 1; @@ -134,7 +134,7 @@ bool ReshapableTo1DTensor(ShapedType rhs_shape) { } Value ReshapeTo1DTensor(OpBuilder& builder, Location loc, Value value) { - auto shape = value.getType().cast(); + auto shape = mlir::cast(value.getType()); if (shape.getRank() != 1) { SmallVector new_shape; new_shape.push_back(shape.getNumElements()); @@ -157,8 +157,8 @@ LogicalResult MatchSupportedAffineOp(Operation* op, Value& binding_output, bool is_supported_affine_op = false; if (llvm::isa(op)) { if (const auto data_format = op->getAttrOfType("data_format")) { - is_supported_affine_op = data_format.getValue().equals("NHWC") || - data_format.getValue().equals("NDHWC"); + is_supported_affine_op = + data_format.getValue() == "NHWC" || data_format.getValue() == "NDHWC"; } } else if (llvm::isa(op)) { if (const auto adj_y = op->getAttrOfType("adj_y")) { @@ -182,7 +182,7 @@ LogicalResult MatchSupportedAffineOp(Operation* op, Value& binding_output, // Makes the 1D value broadcastable with the `rhs_shape`. 
Value MakeOneDimValueBroadcastable(OpBuilder& builder, Location loc, Value value, ShapedType rhs_shape) { - ShapedType value_shape = value.getType().dyn_cast_or_null(); + ShapedType value_shape = mlir::dyn_cast_or_null(value.getType()); if (!value_shape || value_shape.getRank() != 1 || !value_shape.hasStaticShape() || !rhs_shape.hasStaticShape()) { return {}; @@ -211,7 +211,8 @@ bool CanBeSymmetricallyQuantized(Value weight) { auto dq_op = weight.getDefiningOp(); if (!dq_op) return true; - auto qtype = dq_op.getArg().getType().cast().getElementType(); + auto qtype = + mlir::cast(dq_op.getArg().getType()).getElementType(); if (auto uniform_type = llvm::dyn_cast_or_null(qtype)) { return uniform_type.getZeroPoint() == 0; } else if (auto per_axis_type = @@ -252,12 +253,12 @@ Value MultiplyFakeQuantValue(OpBuilder& builder, Location loc, Value value, Value float_value = q_op.getArg(); Value new_value = builder.create(loc, float_value, multiplier); - auto new_value_type = new_value.getType().cast(); + auto new_value_type = mlir::cast(new_value.getType()); // Get multiplier value in double. DenseFPElementsAttr multiplier_attr; if (!matchPattern(multiplier, m_Constant(&multiplier_attr)) || - multiplier_attr.getType().cast().getRank() > 1) { + mlir::cast(multiplier_attr.getType()).getRank() > 1) { return {}; } std::vector multiplier_values; @@ -268,7 +269,7 @@ Value MultiplyFakeQuantValue(OpBuilder& builder, Location loc, Value value, // Multiply the quantization parameters by the multiplier. QuantizedType new_qtype; - auto element_type = q_op.getType().cast().getElementType(); + auto element_type = mlir::cast(q_op.getType()).getElementType(); if (auto uniform_type = llvm::dyn_cast(element_type)) { if (multiplier_attr.isSplat()) { double new_scale = multiplier_array.front() * uniform_type.getScale(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc index fe38ed8dc0f634..cad8c1686eb67b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -171,8 +172,8 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { bool need_to_set_input_nodes_quantization_params = false; for (const BlockArgument arg : func.getArguments()) { - auto shaped = arg.getType().dyn_cast(); - if (shaped && shaped.getElementType().isa() && + auto shaped = mlir::dyn_cast(arg.getType()); + if (shaped && mlir::isa(shaped.getElementType()) && !has_quantize_op(arg)) { need_to_set_input_nodes_quantization_params = true; break; @@ -197,8 +198,8 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { auto add_quantize_op = [&](Location loc, Type input_type, Block* block, Block::iterator insertion_point, Value arg, int i) { - if (auto shaped = input_type.dyn_cast()) { - if (shaped.getElementType().isa()) { + if (auto shaped = mlir::dyn_cast(input_type)) { + if (mlir::isa(shaped.getElementType())) { // If there are existing quantize ops, they are from training and we // should respect them. if (has_quantize_op(arg)) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc index 71587390580406..b2c0ceb205ca99 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" @@ -142,7 +143,7 @@ class PrepareDRQQuantizableOp : public OpRewritePattern { bool getQuantizableOps(arith::ConstantOp op, QuantizationUnits& quantizable_ops) const { // Non-float tensors do not need quantization. 
- auto type = op.getType().dyn_cast(); + auto type = mlir::dyn_cast(op.getType()); if (!type || !type.getElementType().isF32()) return false; Value value = op.getResult(); @@ -183,23 +184,23 @@ class PrepareDRQQuantizableOp : public OpRewritePattern { if (attr.size() < quant_specs_.minimum_elements_for_weights) { op->emitRemark("Quantization is skipped for ") << quantized_op->getName().getStringRef().str() << " because it has " - << attr.dyn_cast().size() + << mlir::dyn_cast(attr).size() << " elements which is fewer than the threshold(" << quant_specs_.minimum_elements_for_weights << " elements)."; return false; } if (is_per_channel_quantization) { - quant_type = quant::GetUniformQuantizedPerAxisTypeForWeight( - attr, quant_dim, - /*symmetric=*/true, bit_width, is_signed, - is_narrow_range, is_legacy_float) - .template dyn_cast(); + quant_type = mlir::dyn_cast( + quant::GetUniformQuantizedPerAxisTypeForWeight( + attr, quant_dim, + /*symmetric=*/true, bit_width, is_signed, is_narrow_range, + is_legacy_float)); } else { - quant_type = quant::GetUniformQuantizedTypeForWeight( - attr, is_narrow_range && is_signed, bit_width, is_signed, - is_narrow_range, is_legacy_float) - .template dyn_cast(); + quant_type = mlir::dyn_cast( + quant::GetUniformQuantizedTypeForWeight( + attr, is_narrow_range && is_signed, bit_width, is_signed, + is_narrow_range, is_legacy_float)); } return insertQDQ(rewriter, op, quant_type, quant_op); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc index 3f54fe580fe1c4..08b2faadacd3d5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc @@ -202,7 +202,7 @@ class PreprocessConstantOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::PartitionedCallOp op, PatternRewriter& rewriter) const override { - const auto f_attr = op.getFAttr().dyn_cast(); + const auto f_attr = mlir::dyn_cast(op.getFAttr()); // Non-quantizable op if (!op->hasAttr(kQuantTraitAttrName)) return failure(); StringRef function_name = f_attr.getValue(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc index 8570652b4019e7..0d2edd5bacd6c1 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/propagate_quantize_type.cc @@ -100,7 +100,7 @@ class PropagateDequantizeOpIfAllowed LogicalResult matchAndRewrite(TF::PartitionedCallOp op, PatternRewriter& rewriter) const override { - const auto f_attr = op.getFAttr().dyn_cast(); + const auto f_attr = mlir::dyn_cast(op.getFAttr()); StringRef function_name = f_attr.getValue(); if (!function_name.starts_with(kDequantizeFunctionName)) return failure(); @@ -127,7 +127,8 @@ class PropagateDequantizeOpIfAllowed auto original_result_type = user_op->getResult(0).getType(); auto new_user_op_type = CloneTypeWithNewElementType( original_result_type, - op_before_dequantize.getType().cast().getElementType()); + mlir::cast(op_before_dequantize.getType()) + .getElementType()); createNewDequantizeOp(rewriter, op, user_op, user_idx, new_user_op_type); } else { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc 
b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc index 0b3c89c56f60bb..50409709d44854 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc @@ -213,11 +213,11 @@ LogicalResult CreateQuantizationParams(QuantizedType elem_type, Location loc, if (!elem_type) { return failure(); } - if (auto qtype = elem_type.dyn_cast()) { + if (auto qtype = mlir::dyn_cast(elem_type)) { return CreateUniformQuantizedTypeParams(qtype, loc, rewriter, scale, zero_point); - } else if (auto qtype = - elem_type.dyn_cast()) { + } else if (auto qtype = mlir::dyn_cast( + elem_type)) { return CreateUniformQuantizedPerAxisTypeParams(qtype, loc, rewriter, scale, zero_point); } @@ -235,7 +235,7 @@ ShapedType ConvertIntToQint(ShapedType input_type, MLIRContext* ctx) { if (ele_type.isIntOrFloat()) { bit_width = ele_type.getIntOrFloatBitWidth(); is_signed = ele_type.isSignlessIntOrFloat() || ele_type.isSignedInteger(); - } else if (QuantizedType qtype = ele_type.dyn_cast()) { + } else if (QuantizedType qtype = mlir::dyn_cast(ele_type)) { bit_width = qtype.getStorageTypeIntegralWidth(); is_signed = qtype.isSigned(); } else { @@ -275,8 +275,9 @@ class ReplaceQuantizePattern LogicalResult matchAndRewrite(quantfork::QuantizeCastOp q_op, PatternRewriter& rewriter) const override { - auto output_type = q_op.getType().cast(); - auto elem_type = output_type.getElementType().dyn_cast(); + auto output_type = mlir::cast(q_op.getType()); + auto elem_type = + mlir::dyn_cast(output_type.getElementType()); const Location loc = q_op->getLoc(); Value scale, zero_point; @@ -289,7 +290,7 @@ class ReplaceQuantizePattern if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { ShapedType new_output_type = ConvertIntToQint( - output_type.cast(), rewriter.getContext()); + mlir::cast(output_type), rewriter.getContext()); if (!new_output_type) { q_op->emitError( "Failed to convert the type to the corresponding qtype."); @@ -327,8 +328,8 @@ class ReplaceDequantizePattern LogicalResult matchAndRewrite(quantfork::DequantizeCastOp dq_op, PatternRewriter& rewriter) const override { - auto input_type = dq_op.getArg().getType().cast(); - auto elem_type = input_type.getElementType().dyn_cast(); + auto input_type = mlir::cast(dq_op.getArg().getType()); + auto elem_type = mlir::dyn_cast(input_type.getElementType()); const Location loc = dq_op->getLoc(); Value scale, zero_point; @@ -340,13 +341,13 @@ class ReplaceDequantizePattern TensorType output_type = input_type.clone(elem_type.getStorageType()); if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { ShapedType new_output_type = ConvertIntToQint( - output_type.cast(), rewriter.getContext()); + mlir::cast(output_type), rewriter.getContext()); if (!new_output_type) { dq_op->emitError( "Failed to convert the type to the corresponding qtype."); return failure(); } - output_type = new_output_type.cast(); + output_type = mlir::cast(new_output_type); } auto scast_op = rewriter.create(loc, output_type, @@ -376,8 +377,8 @@ bool IsQuantizedCallforDynamicRange(TF::PartitionedCallOp call_op) { return false; } else if (cur_op) { // Check if the QuantizeCastOp has element type of quantized type. - if (!getElementTypeOrSelf(cur_op.getResult().getType()) - .isa()) { + if (!mlir::isa( + getElementTypeOrSelf(cur_op.getResult().getType()))) { return false; } // Satisfies the input condition. 
@@ -385,8 +386,8 @@ bool IsQuantizedCallforDynamicRange(TF::PartitionedCallOp call_op) { } } for (Value output : call_op.getOutput()) { - if (auto type = output.getType().dyn_cast()) { - if (type.getElementType().isa()) { + if (auto type = mlir::dyn_cast(output.getType())) { + if (mlir::isa(type.getElementType())) { return false; } } @@ -398,15 +399,15 @@ bool IsQuantizedCallforDynamicRange(TF::PartitionedCallOp call_op) { bool IsQuantizedCallforStaticRange(TF::PartitionedCallOp call_op) { bool has_quantized_types = false; for (Value input : call_op.getArgs()) { - if (auto type = input.getType().dyn_cast()) { - if (type.getElementType().isa()) { + if (auto type = mlir::dyn_cast(input.getType())) { + if (mlir::isa(type.getElementType())) { has_quantized_types = true; } } } for (Value output : call_op.getOutput()) { - if (auto type = output.getType().dyn_cast()) { - if (type.getElementType().isa()) { + if (auto type = mlir::dyn_cast(output.getType())) { + if (mlir::isa(type.getElementType())) { has_quantized_types = true; } } @@ -616,7 +617,7 @@ std::string GetQuantizedFunctionName(StringRef func_name, bool ContainsFloatResultType(ArrayRef result_types) { for (auto current_type : result_types) { - if (current_type.dyn_cast().getElementType().isF32()) + if (mlir::dyn_cast(current_type).getElementType().isF32()) return true; } return false; @@ -644,7 +645,7 @@ class QuantizeFunctionPattern LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, PatternRewriter& rewriter) const override { - const auto f_attr = call_op.getFAttr().dyn_cast(); + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); // removeAttr will return nullptr if no attribute was removed. if (!call_op->removeAttr(kQuantTraitAttrName) || !f_attr) { return failure(); @@ -671,12 +672,12 @@ class QuantizeFunctionPattern SmallVector args; SmallVector qparam_args; for (Value arg : call_op.getArgs()) { - if (const auto arg_type = arg.getType().dyn_cast()) { + if (const auto arg_type = mlir::dyn_cast(arg.getType())) { QuantizedType qtype = - arg_type.getElementType().dyn_cast(); + mlir::dyn_cast(arg_type.getElementType()); if (!qtype) continue; - if (!qtype.isa()) { + if (!mlir::isa(qtype)) { return failure(); } Value scale, zero_point; @@ -693,12 +694,12 @@ class QuantizeFunctionPattern } for (Value result : call_op->getResults()) { - if (auto result_type = result.getType().dyn_cast()) { + if (auto result_type = mlir::dyn_cast(result.getType())) { QuantizedType qtype = - result_type.getElementType().dyn_cast(); + mlir::dyn_cast(result_type.getElementType()); if (!qtype) continue; - if (!qtype.isa()) { + if (!mlir::isa(qtype)) { return failure(); } Value scale, zero_point; @@ -717,12 +718,13 @@ class QuantizeFunctionPattern rewriter.setInsertionPoint(call_op); for (Value arg : call_op.getArgs()) { - TensorType arg_type = arg.getType().dyn_cast(); + TensorType arg_type = mlir::dyn_cast(arg.getType()); if (!arg_type) { args.push_back(arg); continue; } - QuantizedType qtype = arg_type.getElementType().dyn_cast(); + QuantizedType qtype = + mlir::dyn_cast(arg_type.getElementType()); if (!qtype) { args.push_back(arg); continue; @@ -730,15 +732,15 @@ class QuantizeFunctionPattern quantfork::StorageCastOp scast_op; if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { - ShapedType new_arg_type = ConvertIntToQint(arg_type.cast(), - rewriter.getContext()); + ShapedType new_arg_type = ConvertIntToQint( + mlir::cast(arg_type), rewriter.getContext()); if (!new_arg_type) { call_op->emitError( "Failed to convert the type to the corresponding 
qtype."); return failure(); } scast_op = rewriter.create( - arg.getLoc(), new_arg_type.cast(), arg); + arg.getLoc(), mlir::cast(new_arg_type), arg); } else { scast_op = rewriter.create( arg.getLoc(), arg_type.clone(qtype.getStorageType()), arg); @@ -761,20 +763,20 @@ class QuantizeFunctionPattern SmallVector result_types; for (Value result : call_op->getResults()) { - TensorType result_type = result.getType().dyn_cast(); + TensorType result_type = mlir::dyn_cast(result.getType()); if (!result_type) { result_types.push_back(result.getType()); continue; } QuantizedType qtype = - result_type.getElementType().dyn_cast(); + mlir::dyn_cast(result_type.getElementType()); if (!qtype) { result_types.push_back(result_type); continue; } if (target_opset_ == OpSet::UNIFORM_QUANTIZED) { ShapedType new_result_type = ConvertIntToQint( - result_type.cast(), rewriter.getContext()); + mlir::cast(result_type), rewriter.getContext()); result_types.push_back(new_result_type); } else { result_types.push_back(result_type.clone(qtype.getStorageType())); @@ -871,13 +873,13 @@ class QuantizeFunctionPattern rewriter.setInsertionPointAfter(call_op); SmallVector result_types; for (Value result : call_op->getResults()) { - TensorType result_type = result.getType().dyn_cast(); + TensorType result_type = mlir::dyn_cast(result.getType()); if (!result_type) { result_types.push_back(result.getType()); continue; } QuantizedType qtype = - result_type.getElementType().dyn_cast(); + mlir::dyn_cast(result_type.getElementType()); if (!qtype) { result_types.push_back(result_type); continue; @@ -890,7 +892,7 @@ class QuantizeFunctionPattern auto module = call_op->getParentOfType(); SymbolTable symbol_table(module); - const auto f_attr = call_op.getFAttr().dyn_cast(); + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); const auto float_func = dyn_cast(symbol_table.lookup(f_attr.getValue())); rewriter.setInsertionPointAfter(float_func); @@ -973,14 +975,15 @@ class QuantizeConstPattern return failure(); } - ShapedType tensor_qtype = q_op.getResult().getType().cast(); + ShapedType tensor_qtype = + mlir::cast(q_op.getResult().getType()); Attribute tensor_proto_attr = Quantize(attr, tensor_qtype); if (!tensor_proto_attr) { return failure(); } - Type storage_type = - tensor_qtype.getElementType().cast().getStorageType(); + Type storage_type = mlir::cast(tensor_qtype.getElementType()) + .getStorageType(); ShapedType new_type = tensor_qtype.clone(storage_type); Location loc = q_op.getArg().getLoc(); @@ -991,14 +994,14 @@ class QuantizeConstPattern // workaround. tensorflow::TensorProto tensor_proto; if (!mlir::tfg::ConvertToTensorProto( - tensor_proto_attr.cast(), &tensor_proto) + mlir::cast(tensor_proto_attr), &tensor_proto) .ok()) { return failure(); } - const int bit_width = tensor_qtype.getElementType() - .dyn_cast() - .getStorageTypeIntegralWidth(); + const int bit_width = + mlir::dyn_cast(tensor_qtype.getElementType()) + .getStorageTypeIntegralWidth(); tensor_proto.set_dtype((bit_width == 8) ? 
tensorflow::DT_QINT8 : tensorflow::DT_QINT32); @@ -1033,8 +1036,9 @@ class RestoreWeightShapePattern int weight_operand_idx = 1; Operation* weight_op = op.getOperand(weight_operand_idx).getDefiningOp(); - auto weight_type = weight_op->getResult(0).getType().dyn_cast(); - auto input_type = op.getOperand(0).getType().dyn_cast(); + auto weight_type = + mlir::dyn_cast(weight_op->getResult(0).getType()); + auto input_type = mlir::dyn_cast(op.getOperand(0).getType()); llvm::ArrayRef weight_shape = weight_type.getShape(); llvm::ArrayRef input_shape = input_type.getShape(); @@ -1073,7 +1077,7 @@ class RestoreWeightShapePattern LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, PatternRewriter& rewriter) const override { - const auto f_attr = call_op.getFAttr().dyn_cast(); + const auto f_attr = mlir::dyn_cast(call_op.getFAttr()); StringRef function_name = f_attr.getValue(); // TODO(b/228928859): Improve the getter function to match attributes rather // than function name. @@ -1106,7 +1110,8 @@ class QuantizationSummary { module_.walk([&](Operation* op) { if (auto call_op = llvm::dyn_cast_or_null(op)) { - const auto f_attr = call_op.getFAttr().dyn_cast(); + const auto f_attr = + mlir::dyn_cast(call_op.getFAttr()); if (!f_attr) return; StringRef func_name = f_attr.getValue(); if (func_name.starts_with(kQuantizedFuncPrefix)) { @@ -1227,7 +1232,7 @@ class QuantizationSummary { } // Use the first op as the representative name. - return quantized_ops.front().cast().getValue(); + return mlir::cast(quantized_ops.front()).getValue(); } bool IsInCompsiteFunction(Operation* op) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc index 374d687428ee3e..b202798dffe9d0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/replace_cast_hacks_with_tf_xla_ops.cc @@ -33,6 +33,7 @@ limitations under the License. 
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/ValueRange.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -77,8 +78,9 @@ void PrepareXlaConvParams(OpBuilder &builder, Location loc, ArrayAttr strides, SmallVector lhs_dilation_values(num_dims - 2, 1); SmallVector stride_values, rhs_dilation_values; for (int64_t i : llvm::seq(1, num_dims - 1)) { - stride_values.push_back(strides[i].cast().getInt()); - rhs_dilation_values.push_back(dilations[i].cast().getInt()); + stride_values.push_back(mlir::cast(strides[i]).getInt()); + rhs_dilation_values.push_back( + mlir::cast(dilations[i]).getInt()); } window_strides = Create1DConstValue(builder, loc, stride_values); lhs_dilation = Create1DConstValue(builder, loc, lhs_dilation_values); @@ -96,7 +98,7 @@ Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc, return CreateScalarConstValue(builder, loc, 0); } - auto shape = tensor.getType().template cast(); + auto shape = mlir::cast(tensor.getType()); SmallVector non_output_indices; for (int64_t i : llvm::seq(0, shape.getRank())) { if (absl::c_count(output_dims, i) == 0) { @@ -108,7 +110,7 @@ Value CreateZeroPointPartialOffset(OpBuilder &builder, Location loc, Create1DConstValue(builder, loc, non_output_indices); auto zp = CreateScalarConstValue(builder, loc, other_tensor_zp); - TensorType tensor_type = tensor.getType().dyn_cast(); + TensorType tensor_type = mlir::dyn_cast(tensor.getType()); Value tensor_i32 = builder.create( loc, tensor_type.clone(builder.getIntegerType(32)), tensor); auto reduced = @@ -136,7 +138,7 @@ Value MergeZeroPointOffset(OpBuilder &builder, Location loc, Value weight, int8_t input_zp, int8_t weight_zp, Value zp_input_contribution, Value zp_weight_contribution) { - auto weight_shape = weight.getType().template cast(); + auto weight_shape = mlir::cast(weight.getType()); SmallVector weight_non_output_indices; for (auto i : llvm::seq(0, weight_shape.getRank())) { if (absl::c_count(weight_output_dims, i) == 0) { @@ -498,7 +500,7 @@ Value CreateZeroPointPartialOffsetXlaDotV2( return CreateScalarConstValue(builder, loc, 0); } - auto shape = tensor.getType().template cast(); + auto shape = mlir::cast(tensor.getType()); SmallVector tensor_shape; for (auto v : shape.getShape()) { tensor_shape.push_back(v); @@ -506,7 +508,7 @@ Value CreateZeroPointPartialOffsetXlaDotV2( auto zp = CreateScalarConstValue(builder, loc, other_tensor_zp); - TensorType tensor_type = tensor.getType().dyn_cast(); + TensorType tensor_type = mlir::dyn_cast(tensor.getType()); Value tensor_i32 = builder.create( loc, tensor_type.clone(builder.getIntegerType(32)), tensor); @@ -596,7 +598,7 @@ Value CalculateZeroPointOffsetXLADotV2(OpBuilder &builder, Location loc, Value zp_weight_contribution = CreateZeroPointPartialOffsetXlaDotV2( builder, loc, weight, input_zp, dnums, /*is_lhs=*/false, output_rank); - auto weight_shape = weight.getType().template cast(); + auto weight_shape = mlir::cast(weight.getType()); absl::flat_hash_set rhs_contracting_dims; for (auto dim : dnums.rhs_contracting_dimensions()) { @@ -711,8 +713,8 @@ Value CreateXlaConvOpFromTfConv2dOp(OpBuilder &builder, Location loc, ArrayAttr dilations, StringAttr conv_padding, ArrayAttr explicit_paddings) { - auto input_shape = 
input.getType().template cast(); - auto filter_shape = filter.getType().template cast(); + auto input_shape = mlir::cast(input.getType()); + auto filter_shape = mlir::cast(filter.getType()); if (!input_shape.hasRank() || input_shape.getRank() != 4 || !filter_shape.hasRank() || filter_shape.getRank() != 4) { emitError(loc, "input and filter are expected to be 4D tensors"); @@ -731,8 +733,8 @@ Value CreateXlaConvOpFromTfDepthwiseConv2dOp( OpBuilder &builder, Location loc, Value input, Value filter, Value input_zp, Value conv_output, ArrayAttr strides, ArrayAttr dilations, StringAttr conv_padding, ArrayAttr explicit_paddings) { - auto input_shape = input.getType().template cast(); - auto filter_shape = filter.getType().template cast(); + auto input_shape = mlir::cast(input.getType()); + auto filter_shape = mlir::cast(filter.getType()); if (!input_shape.hasRank() || input_shape.getRank() != 4 || !filter_shape.hasRank() || filter_shape.getRank() != 4) { emitError(loc, "input and filter are expected to be 4D tensors"); @@ -759,8 +761,8 @@ Value CreateXlaConvOpFromTfConv3dOp(OpBuilder &builder, Location loc, Value conv_output, ArrayAttr strides, ArrayAttr dilations, StringAttr conv_padding) { - auto input_shape = input.getType().template cast(); - auto filter_shape = filter.getType().template cast(); + auto input_shape = mlir::cast(input.getType()); + auto filter_shape = mlir::cast(filter.getType()); if (!input_shape.hasRank() || input_shape.getRank() != 5 || !filter_shape.hasRank() || filter_shape.getRank() != 5) { emitError(loc, "input and filter are expected to be 5D tensors"); @@ -819,7 +821,7 @@ Value CreateXlaDotV2Op(OpBuilder &builder, Location loc, Value input, Value zp_offset = CalculateZeroPointOffsetXLADotV2( builder, loc, input, weight, input_zp_value, weight_zp_value, dnums, - output.getType().template cast().getRank()); + mlir::cast(output.getType()).getRank()); return builder.create(loc, dot_result, zp_offset); } @@ -891,8 +893,8 @@ GetBroadcastShapesForBatchMatmul(ShapedType input_type, // function, except BroadcastTo, are expected to be folded. void BroadcastBatchDimensionsForBatchMatMul(OpBuilder &builder, Location loc, Value &input, Value &weight) { - ShapedType input_type = input.getType().template cast(); - ShapedType weight_type = weight.getType().template cast(); + ShapedType input_type = mlir::cast(input.getType()); + ShapedType weight_type = mlir::cast(weight.getType()); const int32_t input_rank = input_type.getRank(); const int32_t weight_rank = weight_type.getRank(); const int32_t broadcasted_rank = std::max(input_rank, weight_rank); @@ -984,7 +986,7 @@ Value CreateXlaDotV2OpFromTfBatchMatMulOp(OpBuilder &builder, Location loc, BroadcastBatchDimensionsForBatchMatMul(builder, loc, input, weight); // Both input and weight have the same rank after broadcasting. - ShapedType weight_shape = weight.getType().template cast(); + ShapedType weight_shape = mlir::cast(weight.getType()); int num_batch_dim = weight_shape.getRank() - 2; // Transpose and constant-fold the weight if needed. @@ -1016,7 +1018,7 @@ Value CreateXlaDotV2OpFromTfBatchMatMulOp(OpBuilder &builder, Location loc, // Check if the given value is a ranked type with specified integer width. 
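Background for the zero-point plumbing in CreateZeroPointPartialOffset, MergeZeroPointOffset, and CalculateZeroPointOffsetXLADotV2 above: for a contraction of length \(K\) between quantized operands with zero points \(z_x\) and \(z_w\), the integer arithmetic decomposes as

\[
\sum_{k}\bigl(q_x[k]-z_x\bigr)\bigl(q_w[k]-z_w\bigr)
= \sum_{k} q_x[k]\,q_w[k]
\;-\; z_w\sum_{k} q_x[k]
\;-\; z_x\sum_{k} q_w[k]
\;+\; K\,z_x z_w,
\]

where the first term is the raw XlaDotV2 (or XlaConvV2) accumulation. Each partial-offset helper computes one of the two weighted sums by reducing over the non-output (contracting) dimensions, the merge step folds in the \(K\,z_x z_w\) correction, and the combined offset is then subtracted from the raw dot product.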
bool IsRankedInt(Value value, const int integer_width) { - ShapedType value_type = value.getType().template cast(); + ShapedType value_type = mlir::cast(value.getType()); if (!value_type.hasRank()) return false; if (!value_type.getElementType().isInteger(integer_width)) return false; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td index e33e226be35515..4928bafc7490eb 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.td @@ -61,7 +61,18 @@ def TF_CustomAggregatorOp : TF_Op<"CustomAggregator", [Pure]> { let arguments = (ins TensorOf<[TF_Float32]>:$input, - StrAttr:$id + // The unique id of this `CustomAggregator` op. + StrAttr:$id, + // The integer value of the enforcing `CalibrationMethod`. + I32Attr:$calibration_method, + // The number of histogram bins. + I32Attr:$num_bins, + // Min percentile to be included in the selected range, only used in the + // `HISTOGRAM_PERCENTILE` method. + F32Attr:$min_percentile, + // Max percentile to be included in the selected range, only used in the + // `HISTOGRAM_PERCENTILE` method. + F32Attr:$max_percentile ); let results = (outs @@ -72,6 +83,20 @@ def TF_CustomAggregatorOp : TF_Op<"CustomAggregator", [Pure]> { ); } +def TF_CalibrationStatisticsSaverOp : TF_Op<"CalibrationStatisticsSaver", []> { + let summary = "Aggregates and saves calibration statistics."; + + let arguments = (ins + Variadic>:$inputs, + + StrAttr:$output_file_path, + StrArrayAttr:$ids, + I32ArrayAttr:$calibration_methods + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<0>; +} + def TF_DumpTensorOp : TF_Op<"DumpTensor", []> { let summary = "Dump tensor proto."; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD index 78a8321f9f87d4..c0a472ca8f2e26 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD @@ -45,6 +45,8 @@ cc_library( "//tensorflow/compiler/mlir/quantization/stablehlo/cc:saved_model_export", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:saved_model_import", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:types", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:weight_only_ptq", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:component", "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:statistics", "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index 08ff75ac802613..b44c788bc10f7b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -346,25 +346,10 @@ def test_drq_per_channel_for_non_uniform_opset_raises_value_error( self._input_saved_model_path, quantization_options=options ) - @test_util.run_in_graph_and_eager_modes def test_force_graph_mode_calibration(self): - input_type = dtypes.int32 - input_placeholder = self._create_and_save_tf1_gather_model( 
-        self._input_saved_model_path,
-        signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
-        tags={tag_constants.SERVING},
-        input_key='x',
-        output_key='output',
-        input_type=input_type,
-    )
+    model = self.SimpleModel()
 
-    data_gen = self._create_data_generator(
-        input_key='x',
-        shape=input_placeholder.shape,
-        minval=0,
-        maxval=10,
-        dtype=input_type,
-    )
+    saved_model_save.save(model, self._input_saved_model_path)
 
     options = quant_opts_pb2.QuantizationOptions(
         quantization_method=quant_opts_pb2.QuantizationMethod(
@@ -383,7 +368,7 @@ def test_force_graph_mode_calibration(self):
         quantize_model.quantize(
             self._input_saved_model_path,
             quantization_options=options,
-            representative_dataset=data_gen,
+            representative_dataset=self._simple_model_data_gen(),
         )
       finally:
         # Restore the logger verbosity.
@@ -2725,6 +2710,116 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     self.assertAllClose(new_outputs, got_outputs, atol=0.097)
     self.assertAllClose(new_outputs, expected_outputs, atol=0.057)
 
+  def test_reuse_calibration_data(self):
+    model = self._create_simple_gather_and_conv_model(
+        dtypes.int32, filter_shape=(2, 3, 3, 1024)
+    )
+    saved_model_save.save(model, self._input_saved_model_path)
+
+    data_gen = self._create_data_generator(
+        input_key='input_tensor',
+        shape=[50],
+        minval=0,
+        maxval=64,
+        dtype=dtypes.int32,
+    )
+
+    tags = {tag_constants.SERVING}
+
+    calibration_data_dir = self.create_tempdir('calibration_data').full_path
+    quantization_options = quant_opts_pb2.QuantizationOptions(
+        quantization_method=quant_opts_pb2.QuantizationMethod(
+            preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
+        ),
+        tags=tags,
+        signature_keys=['serving_default'],
+        op_set=quant_opts_pb2.XLA,
+        force_graph_mode_calibration=True,
+        calibration_options=stablehlo_quant_config_pb2.CalibrationOptions(
+            calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX,
+            calibration_data_dir=calibration_data_dir,
+        ),
+    )
+
+    # Run quantization the first time, calibration is expected to be run.
+    with self.assertLogs(level='INFO') as info_logs:
+      # Save the logger verbosity.
+      prev_log_level = logging.get_verbosity()
+      logging.set_verbosity(logging.INFO)
+      try:
+        converted_model1 = quantize_model.quantize(
+            self._input_saved_model_path,
+            self._output_saved_model_path,
+            quantization_options,
+            representative_dataset=data_gen,
+        )
+      finally:
+        # Restore the logger verbosity.
+        logging.set_verbosity(prev_log_level)
+
+    self.assertNotEmpty(info_logs.records)
+    self.assertTrue(
+        self._any_log_contains(
+            'Calibration step is executed in graph mode.',
+            info_logs.records,
+        )
+    )
+    self.assertIsNotNone(converted_model1)
+    self.assertCountEqual(
+        converted_model1.signatures._signatures.keys(), {'serving_default'}
+    )
+
+    output_loader = saved_model_loader.SavedModelLoader(
+        self._output_saved_model_path
+    )
+    output_graphdef = output_loader.get_meta_graph_def_from_tags(
+        tags
+    ).graph_def
+    self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2'))
+
+    # Run quantization the second time, calibration is expected to be skipped.
+    with self.assertLogs(level='INFO') as info_logs:
+      # Save the logger verbosity.
+      prev_log_level = logging.get_verbosity()
+      logging.set_verbosity(logging.INFO)
+      try:
+        converted_model2 = quantize_model.quantize(
+            self._input_saved_model_path,
+            self._output_saved_model_path,
+            quantization_options,
+            representative_dataset=data_gen,
+            overwrite_output_directory=True,
+        )
+      finally:
+        # Restore the logger verbosity.
+ logging.set_verbosity(prev_log_level) + + self.assertNotEmpty(info_logs.records) + self.assertFalse( + self._any_log_contains( + 'Calibration step is executed in graph mode.', + info_logs.records, + ) + ) + self.assertIsNotNone(converted_model2) + self.assertCountEqual( + converted_model2.signatures._signatures.keys(), {'serving_default'} + ) + + # Expect two models to produce the same results. + test_data = ops.convert_to_tensor( + np.random.uniform(low=0, high=64, size=(32)).astype( + dtypes.int32.as_numpy_dtype + ) + ) + new_outputs_1 = converted_model1.signatures['serving_default']( + input_tensor=test_data + )['output'] + new_outputs_2 = converted_model2.signatures['serving_default']( + input_tensor=test_data + )['output'] + self.assertAllClose(new_outputs_1, new_outputs_2) + @test_util.run_in_graph_and_eager_modes def test_function_alias_preserved(self): model = self._create_conv2d_model( @@ -5406,6 +5501,7 @@ def test_einsum_model( @parameterized.named_parameters( ('to_xla_per_tensor', quant_opts_pb2.XLA, False), + ('stablehlo_per_channel', quant_opts_pb2.STABLEHLO, True), ) @test_util.run_in_graph_and_eager_modes def test_matmul_model( @@ -5447,8 +5543,14 @@ def test_matmul_model( ) output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def + if target_opset == quant_opts_pb2.XLA: + self.assertTrue(self._contains_op(output_graphdef, 'XlaDotV2')) + elif target_opset == quant_opts_pb2.STABLEHLO: + # This is to verify the invocation of StableHLO quantizer works. More + # thorough functional tests are in StableHLO quantizer directory. + self.assertTrue(self._contains_op(output_graphdef, 'XlaCallModule')) + # Due to other meta data, the compression is not exactly 1/4. - self.assertTrue(self._contains_op(output_graphdef, 'XlaDotV2')) self.assertLess( testing.get_size_ratio( self._output_saved_model_path, self._input_saved_model_path @@ -5458,6 +5560,7 @@ def test_matmul_model( @parameterized.named_parameters( ('to_xla_per_tensor', quant_opts_pb2.XLA, False), + ('stablehlo_per_channel', quant_opts_pb2.STABLEHLO, True), # TODO: b/289761265 - [Converter Component][TF-Quantizer] Improve Weight- # only Quantization # Enable this back once new weight-only quantizer is supported for per- @@ -5517,7 +5620,7 @@ def test_conv_model( 0.3, ) - if enable_per_channel_quantization: + if enable_per_channel_quantization and target_opset == quant_opts_pb2.XLA: per_channel_size_attr = attr_value_pb2.AttrValue( list=attr_value_pb2.AttrValue.ListValue( shape=[ @@ -5536,6 +5639,12 @@ def test_conv_model( output_graphdef, 'Const', '_output_shapes', per_channel_size_attr ) ) + if target_opset == quant_opts_pb2.XLA: + self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2')) + elif target_opset == quant_opts_pb2.STABLEHLO: + # This is to verify the invocation of StableHLO quantizer works. More + # thorough functional tests are in StableHLO quantizer directory. 
+ self.assertTrue(self._contains_op(output_graphdef, 'XlaCallModule')) input_tensor = array_ops.constant( np.random.uniform(low=0, high=0.1, size=input_shape), @@ -6211,25 +6320,25 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=32, + num_bins=32, ), ), stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=32, + num_bins=32, ), ), stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=32, + num_bins=32, ), ), stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=32, + num_bins=32, ), ), ], @@ -6376,7 +6485,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'default_calibration_options': stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=256, + num_bins=512, min_percentile=0.001, max_percentile=99.999, ), @@ -6390,7 +6499,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'default_calibration_options': stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=256 + num_bins=512 ), ), }, @@ -6402,7 +6511,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'default_calibration_options': stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=256 + num_bins=512 ), ), }, @@ -6414,7 +6523,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'default_calibration_options': stablehlo_quant_config_pb2.CalibrationOptions( calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, calibration_parameters=stablehlo_quant_config_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=256 + num_bins=512 ), ), }, @@ -6441,8 +6550,8 @@ def test_default_calibration_options( default_calibration_options.calibration_method, ) self.assertEqual( - quant_opts.calibration_options.calibration_parameters.initial_num_bins, - default_calibration_options.calibration_parameters.initial_num_bins, + quant_opts.calibration_options.calibration_parameters.num_bins, + default_calibration_options.calibration_parameters.num_bins, ) self.assertEqual( quant_opts.calibration_options.calibration_parameters.min_percentile, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc index 
9f4621360e2e89..e38310879184ef 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc @@ -36,6 +36,7 @@ limitations under the License. #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h" @@ -46,6 +47,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" @@ -77,13 +79,14 @@ using ::mlir::quant::stablehlo::GetFunctionAliases; using ::mlir::quant::stablehlo::kExportStepSuffix; using ::mlir::quant::stablehlo::PostCalibrationComponent; using ::mlir::quant::stablehlo::PreCalibrationComponent; +using ::mlir::quant::stablehlo::RunCalibrationPasses; using ::mlir::quant::stablehlo::UpdateFunctionAliases; +using ::mlir::quant::stablehlo::WeightOnlyPtqComponent; using ::stablehlo::quantization::AddCalibrationStatistics; using ::stablehlo::quantization::ChangeToQuantizedFilename; using ::stablehlo::quantization::DebuggerConfig; -using ::stablehlo::quantization::DisableDebugging; -using ::stablehlo::quantization::EnableDebugging; using ::stablehlo::quantization::ExpandPresets; +using ::stablehlo::quantization::IsCalibrationRequired; using ::stablehlo::quantization::PopulateDefaults; using ::stablehlo::quantization::QuantizationConfig; using ::stablehlo::quantization::io::CreateTmpDir; @@ -156,13 +159,17 @@ absl::StatusOr ModuleOpToExportedModel( absl::StatusOr ExportCalibrationModel( mlir::ModuleOp module_op, mlir::MLIRContext *context, const QuantizationOptions &quantization_options, - const absl::flat_hash_map &function_aliases) { + const absl::flat_hash_map &function_aliases, + absl::string_view calibration_data_dir) { // Clone ModuleOp and function aliases so changes in this pipeline won't // be reflected in the original values. mlir::OwningOpRef cloned_module_ref(module_op.clone()); - // Disable DumpTensor ops when running calibration. 
- DisableDebugging(*cloned_module_ref); + TF_RETURN_IF_ERROR( + RunCalibrationPasses(*cloned_module_ref, *context, calibration_data_dir, + quantization_options.calibration_options() + .force_regenerate_calibration_data())); + if (!IsCalibrationRequired(*cloned_module_ref)) return ExportedModel(); absl::StatusOr exported_model = ModuleOpToExportedModel( *cloned_module_ref, context, kTfQuantPtqPreCalibrationStepName, @@ -177,6 +184,27 @@ absl::StatusOr ExportCalibrationModel( return *exported_model; } +absl::StatusOr ExportDebuggingModel( + mlir::ModuleOp module_op, mlir::MLIRContext *context, + const QuantizationOptions &quantization_options, + const absl::flat_hash_map &function_aliases) { + // Clone ModuleOp and function aliases so changes in this pipeline won't + // be reflected in the original values. + mlir::OwningOpRef cloned_module_ref(module_op.clone()); + + absl::StatusOr exported_model = ModuleOpToExportedModel( + *cloned_module_ref, context, kTfQuantPtqPreCalibrationStepName, + /*unfreeze_constants=*/!quantization_options.freeze_all_variables(), + function_aliases); + if (!exported_model.status().ok()) { + return absl::InternalError( + absl::StrCat("Failed to export debugging model: ", + exported_model.status().message())); + } + + return *exported_model; +} + QuantizationConfig GetQuantizationConfigForStaticRangePtq( const QuantizationOptions &quantization_options) { QuantizationConfig quantization_config{}; @@ -197,10 +225,25 @@ QuantizationConfig GetQuantizationConfigForStaticRangePtq( return ExpandPresets(PopulateDefaults(quantization_config)); } +QuantizationConfig GetQuantizationConfigForWeightOnlyPtq( + const QuantizationOptions &quantization_options) { + QuantizationConfig quantization_config{}; + quantization_config.mutable_weight_only_ptq_preset(); + // When targeting server TPUs quantized types should be unpacked into + // integer ops. + quantization_config.mutable_pipeline_config()->set_unpack_quantized_types( + true); + *quantization_config.mutable_debugger_config() = + quantization_options.debugger_config(); + + return ExpandPresets(PopulateDefaults(quantization_config)); +} + absl::StatusOr QuantizePtqModelPreCalibrationImpl( mlir::ModuleOp module_op, mlir::MLIRContext *context, const QuantizationOptions &quantization_options, - const absl::flat_hash_map &function_aliases) { + const absl::flat_hash_map &function_aliases, + absl::string_view calibration_data_dir) { const bool is_stablehlo = quantization_options.op_set() == OpSet::STABLEHLO; // Use StableHLO Quantizer option if opset is specified. 
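Two helpers introduced by this change drive the skip decision above: RunCalibrationPasses points CustomAggregator statistics at files under calibration_data_dir, and IsCalibrationRequired reports whether any statistics are still missing. A toy stand-in for the resulting rule, assuming statistics live as plain files in that directory (ShouldRunCalibration and the file-based check are hypothetical, not patch APIs):

    #include <filesystem>
    #include <string>

    // Hypothetical sketch: calibration re-runs only when forced, or when no
    // statistics from a previous run exist under `calibration_data_dir`.
    bool ShouldRunCalibration(const std::string& calibration_data_dir,
                              bool force_regenerate_calibration_data) {
      namespace fs = std::filesystem;
      if (force_regenerate_calibration_data) return true;
      return !fs::exists(calibration_data_dir) ||
             fs::is_empty(calibration_data_dir);
    }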
if (is_stablehlo) { @@ -221,7 +264,7 @@ absl::StatusOr QuantizePtqModelPreCalibrationImpl( } return ExportCalibrationModel(module_op, context, quantization_options, - function_aliases); + function_aliases, calibration_data_dir); } absl::StatusOr QuantizePtqModelPostCalibrationImpl( @@ -358,6 +401,7 @@ absl::StatusOr QuantizeWeightOnly( "Failed to get function alias: ", function_aliases.status().message())); } + const bool is_stablehlo = quantization_options.op_set() == OpSet::STABLEHLO; absl::StatusOr> module = ImportAndPreprocessSavedModel( saved_model_path, @@ -365,7 +409,8 @@ absl::StatusOr QuantizeWeightOnly( quantization_options.signature_keys().end()}, {quantization_options.tags().begin(), quantization_options.tags().end()}, - context.get(), /*is_inliner_run=*/true, /*run_tf_to_stablehlo=*/false, + context.get(), /*is_inliner_run=*/true, + /*run_tf_to_stablehlo=*/is_stablehlo, /*deserialize_xla_call_module=*/false, *function_aliases); if (!module.status().ok()) { return absl::InternalError( @@ -374,14 +419,24 @@ absl::StatusOr QuantizeWeightOnly( } mlir::OwningOpRef module_ref = std::move(module).value(); - TF_RETURN_IF_ERROR(RunPasses( - kTfQuantWeightOnlyStepName, - /*add_passes_func=*/ - [&quantization_options](mlir::PassManager &pm) { - AddQuantizeWeightOnlyPasses(pm, quantization_options, - kTfQuantWeightOnlyStepName); - }, - *context, *module_ref)); + // Use StableHLO Quantizer option if opset is specified. + if (is_stablehlo) { + const QuantizationConfig quantization_config = + GetQuantizationConfigForWeightOnlyPtq(quantization_options); + + WeightOnlyPtqComponent weight_only_ptq_component(context.get()); + TF_ASSIGN_OR_RETURN(*module_ref, weight_only_ptq_component.Run( + *module_ref, quantization_config)); + } else { + TF_RETURN_IF_ERROR(RunPasses( + kTfQuantWeightOnlyStepName, + /*add_passes_func=*/ + [&quantization_options](mlir::PassManager &pm) { + AddQuantizeWeightOnlyPasses(pm, quantization_options, + kTfQuantWeightOnlyStepName); + }, + *context, *module_ref)); + } return ModuleOpToExportedModel( *module_ref, context.get(), kTfQuantWeightOnlyStepName, @@ -422,27 +477,34 @@ absl::StatusOr QuantizeStaticRangePtq( } mlir::OwningOpRef module_ref = std::move(module).value(); - TF_ASSIGN_OR_RETURN( - absl::StatusOr pre_calibration_exported_model, - QuantizePtqModelPreCalibrationImpl( - *module_ref, context.get(), quantization_options, *function_aliases)); + std::string calibration_data_dir = + quantization_options.calibration_options().calibration_data_dir(); + if (calibration_data_dir.empty()) { + TF_ASSIGN_OR_RETURN(calibration_data_dir, CreateTmpDir()); + } - TF_ASSIGN_OR_RETURN( - const absl::StatusOr precalibrated_saved_model_dir, - CreateTmpDir()); + TF_ASSIGN_OR_RETURN(ExportedModel calibration_exported_model, + QuantizePtqModelPreCalibrationImpl( + *module_ref, context.get(), quantization_options, + *function_aliases, calibration_data_dir)); - py_function_library.SaveExportedModel( - *precalibrated_saved_model_dir, *pre_calibration_exported_model, - saved_model_path, tags, signature_def_map); + // Save and run the calibration model. 
+ if (calibration_exported_model.has_graph_def()) { + TF_ASSIGN_OR_RETURN(std::string calibration_saved_model_dir, + CreateTmpDir()); + py_function_library.SaveExportedModel( + calibration_saved_model_dir, calibration_exported_model, + saved_model_path, tags, signature_def_map); - py_function_library.RunCalibration( - *precalibrated_saved_model_dir, signature_keys, tags, - quantization_options.force_graph_mode_calibration(), - representative_dataset_file_map_serialized); + py_function_library.RunCalibration( + calibration_saved_model_dir, signature_keys, tags, + quantization_options.force_graph_mode_calibration(), + representative_dataset_file_map_serialized); + } if (absl::Status status = AddCalibrationStatistics( - *module_ref, quantization_options.calibration_options(), - py_function_library); + *module_ref, calibration_data_dir, + quantization_options.calibration_options(), py_function_library); !status.ok()) { LOG(WARNING) << "Some CustomAggregator ops do not have min or max " "values. Parts of the graph are not quantized. " @@ -459,14 +521,17 @@ absl::StatusOr QuantizeStaticRangePtq( if (quantization_options.has_debugger_config() && quantization_options.debugger_config().debugger_type() == DebuggerConfig::DEBUGGER_TYPE_WHOLE_MODEL) { - EnableDebugging(*pre_calibration_exported_model); + TF_ASSIGN_OR_RETURN( + ExportedModel debugging_exported_model, + ExportDebuggingModel(*module_ref, context.get(), quantization_options, + *function_aliases)); ChangeToQuantizedFilename(*module_ref); absl::string_view unquantized_dump_model_path = quantization_options.debugger_config().unquantized_dump_model_path(); py_function_library.SaveExportedModel( - unquantized_dump_model_path, *pre_calibration_exported_model, - saved_model_path, tags, signature_def_map); + unquantized_dump_model_path, debugging_exported_model, saved_model_path, + tags, signature_def_map); } return QuantizePtqModelPostCalibrationImpl( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py index e0eeca13d92f20..f7dec2d2a5dee7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py @@ -607,8 +607,8 @@ def _populate_calibration_options( calib_opts.calibration_method == _CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE ): - if not calib_opts.calibration_parameters.initial_num_bins: - calib_opts.calibration_parameters.initial_num_bins = 256 + if not calib_opts.calibration_parameters.num_bins: + calib_opts.calibration_parameters.num_bins = 512 if not calib_opts.calibration_parameters.min_percentile: calib_opts.calibration_parameters.min_percentile = 0.001 if not calib_opts.calibration_parameters.max_percentile: @@ -632,8 +632,14 @@ def _populate_calibration_options( f' methods. 
calibration_method={calib_opts.calibration_method}'
     )
 
-    if not calib_opts.calibration_parameters.initial_num_bins:
-      calib_opts.calibration_parameters.initial_num_bins = 256
+    if not calib_opts.calibration_parameters.num_bins:
+      calib_opts.calibration_parameters.num_bins = 512
+
+  if calib_opts.calibration_data_dir:
+    save_model.create_empty_output_dir(
+        calib_opts.calibration_data_dir,
+        overwrite=calib_opts.force_regenerate_calibration_data,
+    )
 
 
 def _populate_quantization_options_default_values(
@@ -735,24 +741,24 @@ def _populate_quantization_options_default_values(
   if (quantization_options.op_set == quant_opts_pb2.OpSet.STABLEHLO) and (
       quantization_options.quantization_method.preset_method
       != _PresetMethod.METHOD_STATIC_RANGE_INT8
+      and quantization_options.quantization_method.preset_method
+      != _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8
   ):
     raise ValueError(
         'StableHLO quantized opset currently only supports static range'
-        ' quantization via TF Quantizer.'
+        ' quantization and weight-only quantization via TF Quantizer.'
     )
 
-  if quantization_options.HasField('debugger_config'):
-    # Set `force_graph_mode_calibration` to True to avoid skipping op execution,
-    # which are not connected to return ops, during calibration execution.
-    # Setting `force_graph_mode_calibration` to True enables execution of the
-    # model in graph mode (not eager mode).
-    logging.debug(
-        'Setting `force_graph_mode_calibration = True` to ensure the debugging '
-        'model is executed in graph mode during calibration, rather than eager '
-        'mode.'
-    )
-    quantization_options.force_graph_mode_calibration = True
+  # Set `force_graph_mode_calibration` to True to avoid skipping execution of
+  # ops that are not connected to return ops during calibration.
+  # TODO: b/335031954 - Bring back support to run calibration in Eager mode.
+  logging.debug(
+      'Setting `force_graph_mode_calibration = True` to ensure the calibration'
+      ' mode is executed properly.'
+  )
+  quantization_options.force_graph_mode_calibration = True
 
+  if quantization_options.HasField('debugger_config'):
     if not quantization_options.debugger_config.log_dir_path:
       quantization_options.debugger_config.log_dir_path = '/tmp/dumps'
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/save_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/save_model.py
index 4b4ac4f65fe157..87ad7a11f2e677 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/save_model.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/save_model.py
@@ -128,7 +128,9 @@ def _restore_output_tensor_names(
   return graph_def
 
 
-def _create_empty_output_dir(output_directory: str) -> None:
+def create_empty_output_dir(
+    output_directory: str, overwrite: bool = True
+) -> None:
   """Creates the `output_directory`.
 
   If `output_directory` already exists, it recursively deletes all contents
@@ -138,10 +140,11 @@ def _create_empty_output_dir(output_directory: str) -> None:
 
   Args:
     output_directory: Output directory.
+    overwrite: Whether to clean the output directory if it exists.
   """
-  if file_io.file_exists_v2(output_directory):
+  if overwrite and file_io.file_exists_v2(output_directory):
     logging.info(
-        'Deleting existing directory for quantized model output: %s .',
+        'Deleting existing output directory: %s .',
         output_directory,
     )
     file_io.delete_recursively_v2(output_directory)
@@ -297,7 +300,7 @@ def save_model_v1(
     ValueError iff the graph does not contain a valid signature or the file
     prefix tensor is not found in the graph.
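For reference, the overwrite semantics that create_empty_output_dir gains here, restated as a hypothetical standalone C++ analog (not code from the patch):

    #include <filesystem>

    // With overwrite=true the directory is wiped and recreated; with
    // overwrite=false existing contents (e.g. reusable calibration data)
    // are left intact.
    void CreateEmptyOutputDir(const std::filesystem::path& dir,
                              bool overwrite = true) {
      namespace fs = std::filesystem;
      if (overwrite && fs::exists(dir)) fs::remove_all(dir);
      fs::create_directories(dir);  // no-op if the directory already exists
    }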
""" - _create_empty_output_dir(output_dir) + create_empty_output_dir(output_dir) v1_builder = builder.SavedModelBuilder(output_dir) graph_def = _restore_output_tensor_names(graph_def) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc index 0e756021844a5c..b91e4a23613341 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc @@ -157,7 +157,6 @@ void AddQuantizePtqPreCalibrationPasses( pm.addNestedPass( mlir::quant::CreateInsertCustomAggregationOpsPass( quantization_options.calibration_options())); - pm.addPass(mlir::quant::CreateIssueIDsOfCustomAggregationOpsPass()); } void AddQuantizePtqPostCalibrationPasses( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc index c8db1da7adace4..1d150ce7a648ea 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h" +#include #include #include #include @@ -23,6 +24,7 @@ limitations under the License. #include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -69,7 +71,9 @@ void AddUnfuseMhloOpsPasses(mlir::PassManager& pm) { // Converts TF SavedModel to StableHLO module. The input TF SavedModel can have // StableHLO module serialized into a XlaCallModuleOp. (ex: JAX/PyTorch models) -void AddTFToStablehloPasses(mlir::PassManager& pm) { +void AddTFToStablehloPasses( + mlir::PassManager& pm, + llvm::ArrayRef> input_arg_shapes) { pm.addPass(mlir::odml::CreateRenameEntrypointToMainPass()); // TODO: b/230572023 - Consider improving shape inference for While op instead // of dropping the attribute. This need not be correct for models not trained @@ -97,7 +101,7 @@ void AddTFToStablehloPasses(mlir::PassManager& pm) { pm.addPass(mlir::createSymbolDCEPass()); pm.addPass(mlir::createCanonicalizerPass()); // Propagates shapes on the TensorFlow graph. - pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass(input_arg_shapes)); pm.addPass(mlir::createCanonicalizerPass()); pm.addNestedPass( mlir::TFDevice::CreateDecomposeResourceOpsPass()); @@ -110,7 +114,7 @@ void AddTFToStablehloPasses(mlir::PassManager& pm) { // Generic MLIR optimization passes. pm.addPass(mlir::createCanonicalizerPass()); - pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass(input_arg_shapes)); // Legalizes TF UniformQuantized types into MHLO. Part of the official // TF/XLA bridge component. @@ -120,9 +124,9 @@ void AddTFToStablehloPasses(mlir::PassManager& pm) { // TF -> StableHLO legalization. // Skip StatefulPartitionedCall to preserve aliased functions. 
- mlir::odml::AddLegalizeTFToStablehloPasses( - pm, /*skip_quantization_ops=*/true, - /*skip_resize=*/false, /*skip_stateful_partitioned_call=*/true); + mlir::odml::AddLegalizeTFToStablehloPasses(pm, /*skip_quantization_ops=*/true, + /*skip_resize=*/false, + /*skip_partitioned_calls=*/true); // StableHLO -> MHLO legalization for MHLO optimization. pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); // Rewrites legacy StableHLO ops. @@ -137,7 +141,8 @@ absl::Status PreprocessAndFreezeGraph( const absl::flat_hash_set& noinline_functions, mlir::ModuleOp module_op, mlir::MLIRContext* context, std::optional session, const bool run_tf_to_stablehlo, - const bool deserialize_xla_call_module) { + const bool deserialize_xla_call_module, + llvm::ArrayRef> input_arg_shapes) { mlir::PassManager pm_before_freezing_variables(context); mlir::StatusScopedDiagnosticHandler statusHandler(module_op.getContext(), /*propagate=*/true); @@ -169,7 +174,7 @@ absl::Status PreprocessAndFreezeGraph( if (run_tf_to_stablehlo) { // AddLegalizeTFToStablehloPasses expects frozen TF variables when // legalizing to stablehlo.constant. - AddTFToStablehloPasses(pm_after_freezing_variables); + AddTFToStablehloPasses(pm_after_freezing_variables, input_arg_shapes); } if (deserialize_xla_call_module) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h index 740dca6c7b106b..878b3ebdb27968 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PREPROCESS_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_QUANTIZE_PREPROCESS_H_ +#include #include #include @@ -45,7 +46,8 @@ absl::Status PreprocessAndFreezeGraph( const absl::flat_hash_set& noinline_functions, mlir::ModuleOp module_op, mlir::MLIRContext* context, std::optional session, bool run_tf_to_stablehlo, - bool deserialize_xla_call_module); + bool deserialize_xla_call_module, + llvm::ArrayRef> input_arg_shapes = {}); // Overload of `PreprocessAndFreezeGraph` that uses the default MLIR dump file // prefix. @@ -56,10 +58,15 @@ inline absl::Status PreprocessAndFreezeGraph(mlir::ModuleOp module_op, /*mlir_dump_file_prefix=*/kDefaultTfQuantMlirDumpFilePrefix, /*is_inliner_run=*/true, /*noinline_functions=*/{}, module_op, context, session, /*run_tf_to_stablehlo=*/false, - /*deserialize_xla_call_module=*/false); + /*deserialize_xla_call_module=*/false, /*input_arg_shapes=*/{}); } -void AddTFToStablehloPasses(mlir::PassManager& pm); +// TF->StableHLO has limited support for dynamic shapes. +// Some models can only be converted with explicitly provided input argument +// shapes. 
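To make the comment above concrete, a caller supplying explicit argument shapes might build them roughly as below. The helper and the two shapes are made up; TF shape dimensions are conventionally int64_t, with -1 marking a dynamic dimension, and the exact marshalling into the pass parameter is elided here:

    #include <cstdint>
    #include "llvm/ADT/SmallVector.h"

    // Hypothetical example: static shapes for a model taking an NHWC image
    // tensor and a length-10 vector; these would feed the input_arg_shapes
    // parameter threaded through the passes above.
    llvm::SmallVector<llvm::SmallVector<int64_t>> ExampleInputArgShapes() {
      return {{1, 224, 224, 3}, {10}};
    }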
+void AddTFToStablehloPasses( + mlir::PassManager& pm, + llvm::ArrayRef> input_arg_shapes = {}); } // namespace quantization } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir index f72c9f3388c071..91e1e1d82d6150 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/convert_custom_aggregation_op_to_quant_stats.mlir @@ -1,18 +1,18 @@ // RUN: tf-quant-opt %s -quant-convert-tf-custom-aggregator-op-to-quant-stats | FileCheck %s func.func @customAggregator(%arg0: tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) { - %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, max = 0.2 : f32, id = "0"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) - %1:4 = "tf.CustomAggregator"(%arg0) {id = "1"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, max = 0.2 : f32, id = "0", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) {id = "1", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) func.return %0#0, %1#0 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> } // CHECK: func @customAggregator -// CHECK-NEXT: %[[stats:.*]] = "quantfork.stats"(%arg0) {layerStats = dense<[-1.000000e-01, 2.000000e-01]> : tensor<2xf32>} : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> +// CHECK-NEXT: %[[stats:.*]] = "quantfork.stats"(%arg0) <{layerStats = dense<[-1.000000e-01, 2.000000e-01]> : tensor<2xf32>}> : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: return %[[stats]], %arg0 func.func @doNotHandleNoMinMaxCases(%arg0: tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) { - %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, id = "1"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) - %1:4 = "tf.CustomAggregator"(%arg0) {max = 0.2 : f32, id = "2"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) - %2:4 = "tf.CustomAggregator"(%arg0) {id = "3"} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %0:4 = "tf.CustomAggregator"(%arg0) {min = -0.1 : f32, id = "1", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %1:4 = "tf.CustomAggregator"(%arg0) {max = 0.2 : f32, id = "2", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) + %2:4 = "tf.CustomAggregator"(%arg0) {id = "3", calibration_method = 1 : i32, num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor, tensor, tensor<*xi64>) func.return %0#0, %1#0, 
%2#0 : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> } // CHECK: func @doNotHandleNoMinMaxCases diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir index 052da55dce336d..85480a46e352fe 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/insert_custom_aggregation_ops.mlir @@ -26,10 +26,10 @@ module { // CalibrationOptions(calibration_method=CALIBRATION_METHOD_MIN_MAX) // MIN-MAX-CHECK: func @wrap_composite_func -// MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) -// MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 1 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 1 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 1 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 1 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_1", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> // MIN-MAX-CHECK: func @no_composite_func @@ -43,10 +43,10 @@ module { // CalibrationOptions(calibration_method=CALIBRATION_METHOD_AVERAGE_MIN_MAX) // AVERAGE-MIN-MAX-CHECK: func @wrap_composite_func -// AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) -// AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) 
-> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// AVERAGE-MIN-MAX-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 2 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// AVERAGE-MIN-MAX-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 2 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // AVERAGE-MIN-MAX-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 2 : i32, initial_num_bins = 0 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) +// AVERAGE-MIN-MAX-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 2 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_2", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 0 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<0xi64>) // AVERAGE-MIN-MAX-CHECK-NEXT: return [[res]] : tensor<*xf32> // AVERAGE-MIN-MAX-CHECK: func @no_composite_func @@ -60,13 +60,13 @@ module { // CalibrationOptions( // calibration_method=CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, -// calibration_parameters=CalibrationParameters(initial_num_bins=256, min_percentile=0.001, max_percentile=99.999) +// calibration_parameters=CalibrationParameters(num_bins=256, min_percentile=0.001, max_percentile=99.999) // ) // HISTOGRAM-PERCENTILE-CHECK: func @wrap_composite_func -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) -// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 3 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_3", max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 3 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_3", max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-PERCENTILE-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-PERCENTILE-CHECK-NEXT: 
[[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 3 : i32, initial_num_bins = 256 : i32, max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-PERCENTILE-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 3 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_3", max_percentile = 9.999900e+01 : f32, min_percentile = 1.000000e-03 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-PERCENTILE-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-PERCENTILE-CHECK: func @no_composite_func @@ -80,13 +80,13 @@ module { // CalibrationOptions( // calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, -// calibration_parameters=CalibrationParameters(initial_num_bins=256) +// calibration_parameters=CalibrationParameters(num_bins=256) // ) // HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @wrap_composite_func -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 4 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_4", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 4 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_4", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 4 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 4 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_4", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-BRUTEFORCE-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-BRUTEFORCE-CHECK: func @no_composite_func @@ -100,13 +100,13 @@ module { // CalibrationOptions( // calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, -// 
calibration_parameters=CalibrationParameters(initial_num_bins=256) +// calibration_parameters=CalibrationParameters(num_bins=256) // ) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @wrap_composite_func -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 5 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_5", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 5 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_5", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 5 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 5 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_5", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-MAX-FREQUENCY-CHECK: func @no_composite_func @@ -120,13 +120,13 @@ module { // CalibrationOptions( // calibration_method=CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, -// calibration_parameters=CalibrationParameters(initial_num_bins=256) +// calibration_parameters=CalibrationParameters(num_bins=256) // ) // HISTOGRAM-MSE-SYMMETRIC-CHECK: func @wrap_composite_func -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: 
[[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{calibration_method = 6 : i32, id = "composite_conv2d_with_relu6_fn_arg_1_calibration_method_6", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{calibration_method = 6 : i32, id = "composite_conv2d_with_relu6_fn_arg_0_calibration_method_6", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[add:%.*]] = "tf.PartitionedCall"([[lhs]], [[rhs]]) -// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = ""}> {calibration_method = 6 : i32, initial_num_bins = 256 : i32, max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) +// HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{calibration_method = 6 : i32, id = "composite_conv2d_with_relu6_fn_calibration_method_6", max_percentile = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32, num_bins = 512 : i32}> : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<512xi64>) // HISTOGRAM-MSE-SYMMETRIC-CHECK-NEXT: return [[res]] : tensor<*xf32> // HISTOGRAM-MSE-SYMMETRIC-CHECK: func @no_composite_func @@ -144,14 +144,17 @@ module { module { // CHECK-LABEL: func.func @main func.func @main(%arg0: tensor, %arg1: tensor<100352x10xf32>) -> tensor { - // CHECK-DAG: %[[ARG0_ID:.*]] = "tf.Identity"(%arg0) - // CHECK-DAG: %[[ARG1_ID:.*]] = "tf.Identity"(%arg1) - // CHECK-DAG: %[[ARG0_AGG:.*]] = "tf.CustomAggregator"(%[[ARG0_ID]]) - // CHECK-DAG: %[[ARG1_AGG:.*]] = "tf.CustomAggregator"(%[[ARG1_ID]]) - // CHECK: %[[RES:.*]] = "tf.XlaCallModule"(%[[ARG0_AGG]], %[[ARG1_AGG]]) - // CHECK: %[[RES_AGG:.*]] = "tf.CustomAggregator"(%[[RES]]) - // CHECK-DAG: %[[RES_ID:.*]] = "tf.Identity"(%[[RES_AGG]]) - // CHECK: return %[[RES_ID]] : tensor + // MIN-MAX-CHECK-DAG: %[[ARG0_ID:.*]] = "tf.Identity"(%arg0) + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG0_ID]]) + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK-DAG: %[[ARG1_ID:.*]] = "tf.Identity"(%arg1) + // MIN-MAX-CHECK: %[[ARG1_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[ARG1_ID]]) + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_fn_1_arg_1_calibration_method_1" + // MIN-MAX-CHECK: %[[RES:.*]] = "tf.XlaCallModule"(%[[ARG0_AGG]], %[[ARG1_AGG]]) + // MIN-MAX-CHECK: %[[RES_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%[[RES]]) + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_fn_1_calibration_method_1" + // MIN-MAX-CHECK: %[[RES_ID:.*]] = "tf.Identity"(%[[RES_AGG]]) + // MIN-MAX-CHECK: return %[[RES_ID]] : tensor %0 = "tf.Identity"(%arg0) {device = ""} : (tensor) -> tensor %1 = "tf.Identity"(%arg1) {device = ""} : (tensor<100352x10xf32>) -> tensor<100352x10xf32> %2 = "tf.XlaCallModule"(%0, %1) <{ @@ -162,7 +165,8 @@ module { }> { _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", - _tfl_quant_trait = "fully_quantizable" + _tfl_quant_trait = "fully_quantizable", + _quantization_method = 
"static_range_ptq { }" } : (tensor, tensor<100352x10xf32>) -> tensor %3 = "tf.Identity"(%2) {device = ""} : (tensor) -> tensor return %3 : tensor @@ -175,3 +179,174 @@ module { return %0 : tensor } } + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_0 = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %0 = "tf.Sum"(%arg0, %cst) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst_0) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.If"(%1, %arg0) <{else_branch = @cond_false_80, is_stateless = true, then_branch = @cond_true_70}> {Tcond = i1, Tin = [f32], Tout = [i1, f32], _lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor, tensor<1x4xf32>) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + + + func.func private @cond_false_80(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_false_8"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func private @cond_false_80 + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_calibration_method_1" + + func.func private @cond_true_70(%arg0: tensor<1x4xf32> {tf._user_specified_name = "x"}) -> (tensor, tensor<1x3xf32>) attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x4>], tf._original_func_name = "cond_true_7"} { + %cst = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_1 = "tf.Const"() <{value = 
dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %0 = "tf.Identity"(%cst) {device = ""} : (tensor) -> tensor + %1 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %2 = "tf.Identity"(%1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %0, %2 : tensor, tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func private @cond_true_70 + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_2_arg_0_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_2_calibration_method_1" + + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1833 : i32}, tf_saved_model.semantics} { + func.func @serving_default(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = "tf.Const"() <{value = dense<1.000000e+01> : tensor}> {device = ""} : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<[0, 1]> : tensor<2xi32>}> {device = ""} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() <{value = dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_2 = "tf.Const"() <{value = dense<[0.335351914, 0.084816426, -0.664676845]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %cst_3 = "tf.Const"() <{value = dense : tensor}> {device = ""} : () -> tensor + %cst_4 = "tf.Const"() <{value = dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, 
-0.483384758, -7.277030e-01]]> : tensor<4x3xf32>}> {device = ""} : () -> tensor<4x3xf32> + %cst_5 = "tf.Const"() <{value = dense<[0.117216609, 0.933735609, 0.0728900209]> : tensor<3xf32>}> {device = ""} : () -> tensor<3xf32> + %0 = "tf.Sum"(%arg0, %cst_0) <{keep_dims = false}> {device = ""} : (tensor<1x4xf32>, tensor<2xi32>) -> tensor + %1 = "tf.Greater"(%0, %cst) {device = ""} : (tensor, tensor) -> tensor + %2:2 = "tf.IfRegion"(%1) <{_else_func_name = "cond_false_80", _then_func_name = "cond_true_70", is_stateless = true}> ({ + %4 = "tf.Identity"(%cst_3) {device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%arg0, %cst_1, %cst_2) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %6 = "tf.Identity"(%5) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }, { + %4 = "tf.Identity"(%cst_3) {device = ""} : (tensor) -> tensor + %5 = "tf.PartitionedCall"(%arg0, %cst_4, %cst_5) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_with_bias_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + %6 = "tf.Identity"(%5) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + "tf.Yield"(%4, %6) {device = ""} : (tensor, tensor<1x3xf32>) -> () + }) {_lower_using_switch_merge = true, _read_only_resource_inputs = [], device = ""} : (tensor) -> (tensor, tensor<1x3xf32>) + %3 = "tf.Identity"(%2#1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func @serving_default + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: "tf.IfRegion" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_2_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_matmul_with_bias_fn_1_calibration_method_1" + + func.func private @composite_matmul_with_bias_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + func.func private @composite_matmul_with_bias_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<3xf32>) -> tensor<1x3xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) <{grad_a = false, grad_b = false, transpose_a = false, transpose_b = false}> {attr_map = "0:transpose_a,1:transpose_b", device = ""} : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = "tf.BiasAdd"(%0, %arg2) <{data_format = "NHWC"}> {device = ""} : (tensor<1x3xf32>, tensor<3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 
1836 : i32}, tf_saved_model.semantics} { + func.func @main(%arg0: tensor<10x1x1024xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<10x1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input_tensor:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<0.000000e+00>: tensor<10x1024x3xf32> + %0 = "tf.XlaCallModule"(%arg0, %cst) <{Sout = [#tf_type.shape<10x1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_relu_fn_1, _original_entry_function = "composite_dot_general_with_relu_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + return %0 : tensor<10x1x3xf32> + } + // MIN-MAX-CHECK: func.func @main + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_relu_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_relu_fn_1_calibration_method_1" + + func.func private @composite_dot_general_with_relu_fn_1(%arg0: tensor<10x1x1024xf32>, %arg1: tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %cst = stablehlo.constant dense<0.000000e+00> : tensor<10x1x3xf32> + %0 = stablehlo.dot_general %arg0, %arg1, batching_dims = [0] x [0], contracting_dims = [2] x [1], precision = [DEFAULT, DEFAULT] {mhlo.frontend_attributes = {grad_x = "false", grad_y = "false"}} : (tensor<10x1x1024xf32>, tensor<10x1024x3xf32>) -> tensor<10x1x3xf32> + %1 = stablehlo.maximum %0, %cst : tensor<10x1x3xf32> + return %1 : tensor<10x1x3xf32> + } +} + +// ----- + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1836 : i32}, tf_saved_model.semantics} { + func.func @main(%arg0: tensor<1x4xf32> {tf_saved_model.index_path = ["x"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_x:0", outputs = "PartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = stablehlo.constant dense<1.000000e+01> : tensor + %cst_0 = stablehlo.constant dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706], [-0.954062759, -0.614013135, 0.612640202], [-0.418223292, 5.057390e-01, 0.899269938]]> : tensor<4x3xf32> + %c = stablehlo.constant dense : tensor + %cst_1 = stablehlo.constant dense<[[-0.795477629, 0.581315517, 0.921566545], [0.138622552, 0.463866323, 0.95474267], [-0.143770888, -0.796835303, 0.899996876], [0.0989735424, -0.483384758, -7.277030e-01]]> : tensor<4x3xf32> + %cst_2 = stablehlo.constant dense<-0.000000e+00> : tensor + %cst_3 = stablehlo.constant dense<[[0.335351914, 0.084816426, -0.664676845]]> : tensor<1x3xf32> + %cst_4 = stablehlo.constant dense<[[0.117216609, 0.933735609, 0.0728900209]]> : tensor<1x3xf32> + %0 = stablehlo.reduce(%arg0 init: %cst_2) applies stablehlo.add across dimensions = [0, 1] : (tensor<1x4xf32>, tensor) -> tensor + %1 = stablehlo.compare GT, %0, %cst : (tensor, tensor) -> 
tensor + %2:2 = "stablehlo.if"(%1) ({ + %3 = "tf.XlaCallModule"(%arg0, %cst_0, %cst_3) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_2, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_2", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + stablehlo.return %c, %3 : tensor, tensor<1x3xf32> + }, { + %3 = "tf.XlaCallModule"(%arg0, %cst_1, %cst_4) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_same_shape_fn_1, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn_1", _quantization_method = "static_range_ptq { }", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x4xf32>, tensor<4x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + stablehlo.return %c, %3 : tensor, tensor<1x3xf32> + }) : (tensor) -> (tensor, tensor<1x3xf32>) + return %2#1 : tensor<1x3xf32> + } + // MIN-MAX-CHECK: func.func @main + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_bias_same_shape_fn_1_arg_0_calibration_method_1" + // MIN-MAX-CHECK: "stablehlo.if" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_bias_same_shape_fn_2_calibration_method_1" + // MIN-MAX-CHECK: %[[ARG0_AGG:.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator" + // MIN-MAX-CHECK-SAME: id = "composite_dot_general_with_bias_same_shape_fn_1_calibration_method_1" + + func.func private @composite_dot_general_with_bias_same_shape_fn_2(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } + + func.func private @composite_dot_general_with_bias_same_shape_fn_1(%arg0: tensor<1x4xf32>, %arg1: tensor<4x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x4xf32>, tensor<4x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir deleted file mode 100644 index 6a1621cdf17e89..00000000000000 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/issue_ids_of_custom_aggregation_ops.mlir +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: tf-quant-opt %s -quant-issues-ids-of-custom-aggregation-ops | FileCheck %s - -func.func @issue_ids(%arg0: tensor<*xf32>, %arg1: 
tensor<*xf32>) -> tensor<*xf32> {
-  %0:4 = "tf.CustomAggregator"(%arg1) {id = ""} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<*xi64>)
-  %1:4 = "tf.CustomAggregator"(%arg0) {id = ""} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<*xi64>)
-  %2 = "tf.AddV2"(%1, %0) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
-  %3:4 = "tf.CustomAggregator"(%2) {id = ""} : (tensor<*xf32>) -> (tensor<*xf32>, tensor, tensor, tensor<*xi64>)
-  func.return %3 : tensor<*xf32>
-}
-
-
-// CHECK: func @issue_ids
-// CHECK-NEXT: [[rhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg1) <{id = "0"}> : (tensor<*xf32>)
-// CHECK-NEXT: [[lhs:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"(%arg0) <{id = "1"}> : (tensor<*xf32>)
-// CHECK-NEXT: [[add:%.*]] = "tf.AddV2"([[lhs]], [[rhs]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
-// CHECK-NEXT: [[res:%.*]], {{.*}}, {{.*}}, {{.*}} = "tf.CustomAggregator"([[add]]) <{id = "2"}> : (tensor<*xf32>)
-// CHECK-NEXT: return [[res]] : tensor<*xf32>
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir
index 6a7f9da6bc5563..b0ce385ba41628 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_quantizable_spots_as_functions.mlir
@@ -421,3 +421,88 @@ func.func @conv3d_with_bias(%arg0: tensor<1x3x4x3x3xf32>) -> (tensor<1x3x2x3x2xf
 // CHECK-LABEL: private @composite_conv3d_with_bias_and_relu6_fn_1
 // CHECK-LABEL: private @composite_conv3d_with_bias_fn_1
 }
+
+// -----
+
+// Test that the names of composite functions are deterministic. There are 3
+// unsorted functions in this module and each function has 2 quantizable ops.
+module { + func.func @float_conv_3(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu6"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + func.return %2, %5 : tensor<*xf32>, tensor<*xf32> + } + + func.func @float_conv_1(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu6"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + func.return %2, %5 : tensor<*xf32>, tensor<*xf32> + } + + func.func @float_conv_2(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> (tensor<*xf32>, tensor<*xf32>) { + %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %2 = "tf.Relu6"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + + %3 = "tf.Conv2D"(%arg0, %arg1) { + data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], + padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %4 = "tf.BiasAdd"(%3, %cst) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<2xf32>) -> tensor<*xf32> + %5 = "tf.Relu6"(%4) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + + func.return %2, %5 : tensor<*xf32>, tensor<*xf32> + } +} + +// CHECK-LABEL: @float_conv_3 +// CHECK: "tf.PartitionedCall" +// 
CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_6 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_5 + +// CHECK-LABEL: @float_conv_1 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_2 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_1 + +// CHECK-LABEL: @float_conv_2 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_4 +// CHECK: "tf.PartitionedCall" +// CHECK-SAME: f = @composite_conv2d_with_bias_and_relu6_fn_3 + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir index 0f3c7024dba4b4..7e020bd279d2b6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize.mlir @@ -21,7 +21,7 @@ func.func private @conv(%input: tensor<1x3x4x3xf32> {tf._user_specified_name = " // CHECK-DAG: [[bias:%.+]] = "arith.constant"() <{value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK-DAG: [[weight:%.+]] = "arith.constant"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2x!quant.uniform> -// CHECK: [[q_input:%.+]] = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: [[q_input:%.+]] = "quantfork.qcast"([[ARG0:%arg[0-9]+]]) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> // CHECK-NEXT: [[q_bias:%.+]] = "quantfork.qcast"([[bias]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform> // CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) <{config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> // CHECK-NEXT: [[res:%.+]] = "quantfork.dcast"([[conv]]) : (tensor<*x!quant.uniform>) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir index e3bda3f5d09af9..b8b31d880e0c78 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_drq.mlir @@ -15,7 +15,7 @@ module { // CHECK: %[[cst:.*]] = "arith.constant"() <{value = dense<0.000000e+00> : tensor<2x1024xf32>}> : () -> tensor<2x1024xf32> // CHECK: %[[q_cst:.*]] = "quantfork.qcast"(%[[cst]]) : (tensor<2x1024xf32>) -> tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>> -// CHECK: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_cst]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<*xf32> +// CHECK: %[[out:.*]] = "tf.PartitionedCall"([[ARG0:%arg[0-9]+]], %[[q_cst]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x2x2x3xf32>, tensor<2x1024x!quant.uniform:f32, 3.9370078740157481E-9>>) -> tensor<*xf32> // CHECK: "func.return"(%[[out]]) : (tensor<*xf32>) -> () } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir 
b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir index f24b6399774f08..6a6c176ad37d2e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_xla.mlir @@ -21,7 +21,7 @@ func.func private @conv(%input: tensor<1x3x4x3xf32> {tf._user_specified_name = " // CHECK-DAG: [[bias:%.+]] = "arith.constant"() <{value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>}> : () -> tensor<2xf32> // CHECK-DAG: [[weight:%.+]] = "arith.constant"() <{value = dense_resource<__elided__> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2x!quant.uniform> -// CHECK: [[q_input:%.+]] = "quantfork.qcast"(%arg0) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: [[q_input:%.+]] = "quantfork.qcast"([[ARG0:%arg[0-9]+]]) : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> // CHECK-NEXT: [[q_bias:%.+]] = "quantfork.qcast"([[bias]]) : (tensor<2xf32>) -> tensor<2x!quant.uniform> // CHECK-NEXT: [[conv:%.+]] = "tf.PartitionedCall"([[q_input]], [[weight]], [[q_bias]]) <{config = "", config_proto = "", executor_type = "", f = @[[composite_fn:composite_conv2d_with_bias_and_relu6_fn.*]]}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<*x!quant.uniform> // CHECK-NEXT: [[res:%.+]] = "quantfork.dcast"([[conv]]) : (tensor<*x!quant.uniform>) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD index 21600fb78083a5..4397b4fc5a3f2d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/BUILD @@ -30,6 +30,7 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -85,6 +86,7 @@ cc_library( "@com_google_absl//absl/strings:str_format", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@local_xla//xla:xla_data_proto_cc", ], ) @@ -100,5 +102,6 @@ tf_cc_test( "@com_google_googletest//:gtest_main", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/fake_quant_utils.h b/tensorflow/compiler/mlir/quantization/tensorflow/utils/fake_quant_utils.h index 5a1734bf6bf026..702e19506d2fd6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/fake_quant_utils.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/fake_quant_utils.h @@ -112,7 +112,7 @@ class ConvertFakeQuantOpToQuantOps { Value input = tf_op.getInputs(); int quant_dim = -1; - auto input_type = input.getType().template cast(); + auto input_type = mlir::cast(input.getType()); if (PerAxis) { if (!input_type.hasRank()) { tf_op.emitError("The input should have known rank for per-channel op."); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.cc index 264c6c508a60f7..1392bf4de2a92f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_quantize_op_utils.cc @@ -16,14 +16,15 @@ limitations under the License. 
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { namespace quant { UnrankedTensorType CreateUnknownShapeFromElementType(Type tensor_type) { - if (!tensor_type.cast()) return UnrankedTensorType(); + if (!mlir::cast(tensor_type)) return UnrankedTensorType(); return UnrankedTensorType::get( - tensor_type.cast().getElementType()); + mlir::cast(tensor_type).getElementType()); } } // namespace quant diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc index 967af993c0bcf7..430d5ff6ba2047 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_uniform_attribute_utils.cc @@ -33,6 +33,7 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.h" @@ -66,9 +67,9 @@ constexpr std::array kSuffixes = {"_min_val", "_max_val"}; Attribute GetWindowStridesValue( PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { - ArrayAttr stride = identifier_to_attr["strides"].dyn_cast(); - const int stride_h = stride[1].cast().getInt(); - const int stride_w = stride[2].cast().getInt(); + ArrayAttr stride = mlir::dyn_cast(identifier_to_attr["strides"]); + const int stride_h = mlir::cast(stride[1]).getInt(); + const int stride_w = mlir::cast(stride[2]).getInt(); return rewriter.getI64ArrayAttr({stride_h, stride_w}); } @@ -79,23 +80,24 @@ Attribute GetLhsDilationValue(PatternRewriter& rewriter, Attribute GetRhsDilationValue(PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { - ArrayAttr dilations = identifier_to_attr["dilations"].dyn_cast(); - const int dilation_h = dilations[1].cast().getInt(); - const int dilation_w = dilations[2].cast().getInt(); + ArrayAttr dilations = + mlir::dyn_cast(identifier_to_attr["dilations"]); + const int dilation_h = mlir::cast(dilations[1]).getInt(); + const int dilation_w = mlir::cast(dilations[2]).getInt(); return rewriter.getI64ArrayAttr({dilation_h, dilation_w}); } Attribute GetPaddingValue(PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { llvm::StringRef padding = - identifier_to_attr["padding"].dyn_cast().getValue(); + mlir::dyn_cast(identifier_to_attr["padding"]).getValue(); return rewriter.getStringAttr(padding); } Attribute GetExplicitPaddingValue( PatternRewriter& rewriter, llvm::StringMap& identifier_to_attr) { ArrayAttr explicit_padding = - identifier_to_attr["explicit_paddings"].dyn_cast(); + mlir::dyn_cast(identifier_to_attr["explicit_paddings"]); return explicit_padding; } @@ -167,7 +169,7 @@ LogicalResult CheckIfAttrIs8Bit(const std::string& attr, Operation* op, element_type = getElementTypeOrSelf(op->getOpResult(0).getType()); } if (element_type) { - is_8_bit = element_type.isa(); + is_8_bit = mlir::isa(element_type); return success(); } return failure(); @@ -295,7 +297,8 @@ LogicalResult 
FillAttributesForUniformQuantizedConvolutionOp( auto feature_group_cnt_attr = llvm::StringRef("feature_group_count"); int feature_group_cnt = 1; - ShapedType input_shape = op->getOperand(0).getType().dyn_cast(); + ShapedType input_shape = + mlir::dyn_cast(op->getOperand(0).getType()); if (!input_shape) { return op->emitError( "Only input with known shape is supported for Uniform Quantized " @@ -425,7 +428,8 @@ LogicalResult FillAttributesForUniformRequantizeOp( activation_quantization_axis = GetQuantizationAxis(rewriter, op, /*operand_index=*/0); - auto output_scale_type = op->getOperand(3).getType().dyn_cast(); + auto output_scale_type = + mlir::dyn_cast(op->getOperand(3).getType()); if (!output_scale_type) { return failure(); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc index f1d7a6ae576c7b..b22726de30aeaa 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/strings/str_format.h" #include "llvm/ADT/ArrayRef.h" +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h" #include "xla/xla_data.pb.h" @@ -34,8 +35,7 @@ Value GetDimValue(OpBuilder &builder, Location loc, Value shape_value, return builder.create( loc, RankedTensorType::get( - {}, - shape_value.getType().template cast().getElementType()), + {}, mlir::cast(shape_value.getType()).getElementType()), /*input=*/shape_value, /*begin=*/Create1DConstValue(builder, loc, {dim}), /*end=*/Create1DConstValue(builder, loc, {dim + 1}), @@ -109,14 +109,14 @@ Value PadForDynamicShapedInputSamePadding( CreateConstValue(builder, loc, {rank}, shape)); }; - ShapedType filter_shape = filter.getType().template cast(); + ShapedType filter_shape = mlir::cast(filter.getType()); Value input_shape_value = builder.create( loc, RankedTensorType::get({num_dims}, builder.getI32Type()), input); auto scalar_to_rank1 = [&](Value value) { return reshape_op(value, {1}); }; for (int i : llvm::seq(1, num_dims - 1)) { Value input_size_i = GetDimValue(builder, loc, input_shape_value, i); - const int stride_i = strides[i].cast().getInt(); - const int dilation_i = dilations[i].cast().getInt(); + const int stride_i = mlir::cast(strides[i]).getInt(); + const int dilation_i = mlir::cast(dilations[i]).getInt(); const int filter_i = filter_shape.getDimSize(i - 1); Value pad_i_low, pad_i_high; GetSamePaddingValues(builder, loc, input_size_i, filter_i, dilation_i, @@ -154,21 +154,21 @@ Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc, StringAttr conv_padding, ArrayAttr explicit_paddings, Value &padding, int num_dims) { - ShapedType input_shape = input.getType().template cast(); + ShapedType input_shape = mlir::cast(input.getType()); SmallVector spatial_dims(num_dims - 2); absl::c_iota(spatial_dims, 1); bool has_dynamic_spatial_dim = absl::c_any_of( spatial_dims, [&input_shape](int64_t dim) { return input_shape.isDynamicDim(dim); }); - if (conv_padding.strref().equals("SAME") && has_dynamic_spatial_dim) { + if (conv_padding.strref() == "SAME" && has_dynamic_spatial_dim) { return PadForDynamicShapedInputSamePadding( builder, loc, input, 
filter, input_zp_value, strides, dilations, conv_padding, padding, num_dims); } - ShapedType filter_shape = filter.getType().template cast(); + ShapedType filter_shape = mlir::cast(filter.getType()); SmallVector padding_values(2 * num_dims, 0); - if (conv_padding.strref().equals("EXPLICIT")) { + if (conv_padding.strref() == "EXPLICIT") { if (explicit_paddings.size() != 2 * num_dims) { emitError(loc, absl::StrFormat( @@ -178,16 +178,16 @@ Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc, } for (int i : spatial_dims) { padding_values[2 * i] = - explicit_paddings[2 * i].cast().getInt(); + mlir::cast(explicit_paddings[2 * i]).getInt(); padding_values[2 * i + 1] = - explicit_paddings[2 * i + 1].cast().getInt(); + mlir::cast(explicit_paddings[2 * i + 1]).getInt(); } - } else if (conv_padding.strref().equals("SAME")) { + } else if (conv_padding.strref() == "SAME") { for (int i : spatial_dims) { int input_size = input_shape.getDimSize(i); int filter_size = filter_shape.getDimSize(i - 1); - int stride_i = strides[i].cast().getInt(); - int dilation_i = dilations[i].cast().getInt(); + int stride_i = mlir::cast(strides[i]).getInt(); + int dilation_i = mlir::cast(dilations[i]).getInt(); int out_size = tflite::ComputeOutSize(kTfLitePaddingSame, input_size, filter_size, stride_i, dilation_i); @@ -243,7 +243,7 @@ Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc, // // packed_value = bitwise_or(packed_low, packed_high) Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) { - ShapedType value_type = value.getType().cast(); + ShapedType value_type = mlir::cast(value.getType()); const int rank = value_type.getRank(); SmallVector packed_shape(value_type.getShape().begin(), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc index cc4bbb344026da..cbcda677b87733 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils_test.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" @@ -51,7 +52,8 @@ void PackOperandTestHelper( DenseIntElementsAttr packed_value_attr; ASSERT_TRUE(matchPattern(packed_value, m_Constant(&packed_value_attr))); - ShapedType packed_shape_type = packed_value.getType().dyn_cast(); + ShapedType packed_shape_type = + mlir::dyn_cast(packed_value.getType()); llvm::SmallVector packed_shape(packed_shape_type.getShape().begin(), packed_shape_type.getShape().end()); EXPECT_THAT(packed_shape, testing::ElementsAreArray(expected_packed_shape)); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/BUILD new file mode 100644 index 00000000000000..ecc3c9e7ca6ebe --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/BUILD @@ -0,0 +1,103 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") +load("//tensorflow:tensorflow.default.bzl", "filegroup", "get_compatible_with_portable") +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") +load("//tensorflow/compiler/mlir/quantization/stablehlo:internal_visibility_allowlist.bzl", "internal_visibility_allowlist") + +package_group( + name = "internal_visibility_allowlist_package", + packages = [ + "//tensorflow/compiler/mlir/lite/...", + "//tensorflow/compiler/mlir/quantization/...", + "//tensorflow/compiler/mlir/tf2xla/transforms/...", + "//tensorflow/lite/...", + ] + internal_visibility_allowlist(), +) + +package( + # copybara:uncomment default_applicable_licenses = ["@stablehlo//:license"], + default_visibility = [ + ":internal_visibility_allowlist_package", + "//tensorflow:__pkg__", + ], + licenses = ["notice"], +) + +cc_library( + name = "tf_to_stablehlo", + srcs = [ + "tf_to_stablehlo.cc", + ], + hdrs = [ + "tf_to_stablehlo.h", + ], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:saved_model_import", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", + "//tensorflow/compiler/mlir/tensorflow/transforms:shape_inference_pass", + "//tensorflow/core:core_cpu_base", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:statusor", + ], + alwayslink = True, +) + +tf_cc_binary( + name = "tf-to-stablehlo-translate", + srcs = [ + "tf_to_stablehlo_translate.cc", + ], + visibility = [":internal_visibility_allowlist_package"], + deps = [ + ":tf_to_stablehlo", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + 
"@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], +) + +glob_lit_tests( + name = "all_tests", + data = [":test_utilities"], + default_tags = [ + "no_oss", + "no_pip", + ], + driver = "//tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo:run_lit.sh", + size_override = { + }, + tags_override = { + }, + test_file_exts = ["mlir"], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + ":tf-to-stablehlo-translate", + "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", + "@llvm-project//mlir:run_lit.sh", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/README.md b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/README.md new file mode 100644 index 00000000000000..a65de3c38df001 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/README.md @@ -0,0 +1,123 @@ +# Tensorflow SavedModel to StableHLO (tf-to-stablehlo-translate) + +Converts TensorFlow models (SavedModel or MLIR module) to StableHLO MLIR +modules, preserving model structure and signatures. It enables seamless +integration of TensorFlow models into MLIR-based compiler frameworks for further +optimization and deployment. + +## C++ APIs + +```bash +tf-to-stablehlo-translate \ + --input-path=/path/to/model \ + [--exported-model-signatures=signature1,signature2] \ + [--tag-names=tag1,tag2] \ + [--input-arg-shapes-str=arg-name:shape,...] \ + [--e] \ + [--output-filename=/path/to/output.mlir] +``` + +* `--input-path`: The path to the input TensorFlow SavedModel or MLIR module + with .mlir extension. +* `--exported-model-signatures`: Comma-separated list of exported model + signatures to convert. Ignored for MLIR input. +* `--tags`: Comma-separated list of tags for loading SavedModel. Ignored for + MLIR input. +* `--input-arg-shapes`: A string representation of input argument shapes for + 'main' entry-point, separating tensors with ':', dimension with ',', and + using '?' for unknown sizes. For example, `input-arg-shapes=1,2::1,?` + expresses argument shapes `[1,2]`, `[]` and `[1,?]`. +* `--e`: Elide large elements attrs while dumping the output StableHLO. +* `--output_filename`: Path to the output file where the textual StableHLO MLIR + module will be written (default: stdout). + + +### Examples + +* To convert [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) +model to StableHLO with static input shape `4x3x224x224` for input argument with +type `tensor`. + +```bash +tf-to-stablehlo-translate --input-arg-shapes=4,3,224,224 +``` + +* To convert +[google-bert/bert-large-uncased](https://huggingface.co/google-bert/bert-large-uncased) +to StableHLO with static input shapes `1x12`, `1x12`, and `1x12` for input +arguments with types `tensor, tensor, tensor`. + +```bash +tf-to-stablehlo-translate --input-arg-shapes=1,12:1,12:1,12 +``` + +### Dependencies + +* TensorFlow +* MLIR +* Abseil (absl) + +## Python APIs + + +### `savedmodel_to_stablehlo` + +Converts a TensorFlow SavedModel into StableHLO bytecode. 
+
+```Python
+from tensorflow.compiler.mlir.quantization.tensorflow_to_stablehlo.python import pywrap_tensorflow_to_stablehlo as tf2shlo
+
+stablehlo_bytes = tf2shlo.savedmodel_to_stablehlo(
+    input_path="/path/to/your/savedmodel",
+    exported_model_signatures=["serving_default"],
+    tag_names=["serve"],
+    input_arg_shapes_str="1,28,28,3::32"
+)
+
+```
+
+#### Arguments:
+
+* `input_path` (required): Path to your SavedModel directory.
+* `exported_model_signatures` (optional): List of signature names to convert.
+  Defaults to `["serving_default"]`.
+* `tag_names` (optional): List of tags associated with the SavedModel. Defaults
+  to `["serve"]`.
+* `input_arg_shapes_str` (optional): A string representation of input argument
+  shapes for the 'main' entry-point, separating tensors with ':', dimensions
+  with ',', and using '?' for unknown sizes. For example,
+  `input_arg_shapes_str="1,2::1,?"` expresses argument shapes `[1,2]`, `[]`,
+  and `[1,?]`.
+
+#### Error Handling
+
+An exception will be raised with details about the error.
+
+### `tensorflow_module_to_stablehlo`
+
+Converts a TensorFlow MLIR module string into StableHLO bytecode.
+
+```Python
+from tensorflow.compiler.mlir.quantization.tensorflow_to_stablehlo.python import pywrap_tensorflow_to_stablehlo as tf2shlo
+
+stablehlo_bytes = tf2shlo.tensorflow_module_to_stablehlo(
+    module_op_str="your_tensorflow_mlir_module_string",
+    input_arg_shapes_str="1,28,28,3::32"
+)
+```
+
+#### Arguments:
+
+* `module_op_str` (required): String containing the TensorFlow MLIR module.
+* `input_arg_shapes_str` (optional): A string representation of input argument
+  shapes for the 'main' entry-point, separating tensors with ':', dimensions
+  with ',', and using '?' for unknown sizes. For example,
+  `input_arg_shapes_str="1,2::1,?"` expresses argument shapes `[1,2]`, `[]`,
+  and `[1,?]`.
+
+#### Error Handling
+
+Returns `py::none()` (equivalent to Python's `None`) if there's an error; an
+exception with details about the error will also be raised.
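+
+#### Example
+
+A minimal sketch of calling `tensorflow_module_to_stablehlo` defensively. The
+TF-dialect module string and output path below are illustrative placeholders,
+and catching a generic `Exception` is an assumption, since the exact exception
+type surfaced by the binding is not specified above.
+
+```Python
+from tensorflow.compiler.mlir.quantization.tensorflow_to_stablehlo.python import pywrap_tensorflow_to_stablehlo as tf2shlo
+
+# A tiny TF-dialect module whose input has an unknown leading dimension
+# (illustrative only); the shape string below refines it to a static shape.
+module_str = """
+module {
+  func.func @main(%arg0: tensor<?x4xf32>) -> tensor<?x4xf32> {
+    %0 = "tf.AddV2"(%arg0, %arg0) : (tensor<?x4xf32>, tensor<?x4xf32>) -> tensor<?x4xf32>
+    func.return %0 : tensor<?x4xf32>
+  }
+}
+"""
+
+try:
+    stablehlo_bytes = tf2shlo.tensorflow_module_to_stablehlo(
+        module_op_str=module_str,
+        input_arg_shapes_str="1,4",
+    )
+except Exception as e:  # Kept generic on purpose; narrow if the binding documents a type.
+    raise SystemExit(f"StableHLO conversion failed: {e}")
+
+if stablehlo_bytes is None:
+    # Per the error-handling note above, a failure may also surface as `None`.
+    raise SystemExit("Conversion returned no StableHLO module.")
+
+# Persist the StableHLO bytecode for later use with MLIR tooling.
+with open("/tmp/model.stablehlo.mlirbc", "wb") as out_f:
+    out_f.write(stablehlo_bytes)
+```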
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/BUILD
new file mode 100644
index 00000000000000..f7a1c77026d215
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/BUILD
@@ -0,0 +1,108 @@
+load(
+    "//tensorflow:tensorflow.default.bzl",
+    "get_compatible_with_portable",
+    "tf_py_strict_test",
+    "tf_python_pybind_extension",
+)
+load("//tensorflow/compiler/mlir/quantization/stablehlo:internal_visibility_allowlist.bzl", "internal_visibility_allowlist")
+
+package_group(
+    name = "internal_visibility_allowlist_package",
+    packages = [
+        "//tensorflow/compiler/mlir/lite/...",
+        "//tensorflow/compiler/mlir/quantization/...",
+        "//tensorflow/compiler/mlir/tf2xla/transforms/...",
+        "//tensorflow/lite/...",
+    ] + internal_visibility_allowlist(),
+)
+
+package(
+    # copybara:uncomment default_applicable_licenses = ["@stablehlo//:license"],
+    default_visibility = [
+        ":internal_visibility_allowlist_package",
+        "//tensorflow:__pkg__",
+        "//tensorflow/python:__pkg__",
+    ],
+    licenses = ["notice"],
+)
+
+# copybara:uncomment_begin(google-only)
+# tf_py_strict_test(
+#     name = "tensorflow_to_stablehlo_test",
+#     testonly = 1,
+#     srcs = ["integration_test/tensorflow_to_stablehlo_test.py"],
+#     deps = [
+#         ":pywrap_tensorflow_to_stablehlo",
+#         "//testing/pymocks:matchers",
+#         "//third_party/py/mlir",
+#         "//third_party/py/mlir:ir",
+#         "//third_party/py/mlir:stablehlo_dialect",
+#         "//third_party/py/mlir/_mlir_libs:_mlirRegisterEverything",
+#         "//tensorflow:tensorflow_py",
+#         "//tensorflow/compiler/mlir/stablehlo",
+#         "//tensorflow/python/framework:test_lib",
+#         "//tensorflow/python/platform:client_testlib",
+#         "//tensorflow/python/types:core",
+#     ],
+# )
+# copybara:uncomment_end
+
+# This is a header-only target. The `pywrap_tensorflow_to_stablehlo_lib_*` targets expose only
+# the symbols that are required by `pywrap_tensorflow_to_stablehlo`, which translates them into
+# Python functions. The only intended user of this library is `pywrap_tensorflow_to_stablehlo`.
+# Not letting `pywrap_tensorflow_to_stablehlo` depend directly on sub-libraries, and instead
+# having a consolidated impl library `pywrap_tensorflow_to_stablehlo_lib_impl`, allows the
+# maintainers to avoid declaring multiple impl libraries to `libtensorflow_cc` and
+# `lib_pywrap_tensorflow_internal`, which is required to avoid ODR violations.
+cc_library(
+    name = "pywrap_tensorflow_to_stablehlo_lib_header_only",
+    srcs = [],
+    hdrs = ["pywrap_tensorflow_to_stablehlo_lib.h"],
+    compatible_with = get_compatible_with_portable(),
+    visibility = ["//visibility:private"],  # ONLY for `pywrap_tensorflow_to_stablehlo`.
+    deps = [
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
+# See the comments for `pywrap_tensorflow_to_stablehlo_lib_header_only`.
+cc_library(
+    name = "pywrap_tensorflow_to_stablehlo_lib_impl",
+    srcs = ["pywrap_tensorflow_to_stablehlo_lib.cc"],
+    hdrs = ["pywrap_tensorflow_to_stablehlo_lib.h"],
+    compatible_with = get_compatible_with_portable(),
+    visibility = [
+        "//tensorflow:__pkg__",  # For libtensorflow_cc.so.
+        "//tensorflow/python:__pkg__",  # For lib_pywrap_tensorflow_internal.so.
+ ], + deps = [ + "//tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo:tf_to_stablehlo", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:lib", + "//third_party/python_runtime:headers", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:BytecodeWriter", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Support", + ], +) + +tf_python_pybind_extension( + name = "pywrap_tensorflow_to_stablehlo", + srcs = ["pywrap_tensorflow_to_stablehlo.cc"], + pytype_srcs = ["pywrap_tensorflow_to_stablehlo.pyi"], + # Each dependency MUST be either header-only or exclusive. + deps = [ + ":pywrap_tensorflow_to_stablehlo_lib_header_only", + "//third_party/python_runtime:headers", + "@pybind11", + "@pybind11_abseil//pybind11_abseil:absl_casters", + "@pybind11_abseil//pybind11_abseil:status_casters", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/integration_test/tensorflow_to_stablehlo_test.py b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/integration_test/tensorflow_to_stablehlo_test.py new file mode 100644 index 00000000000000..28b224abdd5db9 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/integration_test/tensorflow_to_stablehlo_test.py @@ -0,0 +1,78 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import tempfile +from mlir import ir +from mlir.dialects import stablehlo +import tensorflow as tf +from tensorflow.compiler.mlir.quantization.tensorflow_to_stablehlo.python import pywrap_tensorflow_to_stablehlo as tensorflow_to_stablehlo +from tensorflow.python.platform import test + + +def build_savedmodel(tempdir) -> str: + + class AddOneModel(tf.keras.Model): + + def call(self, x): + return x + 1 + + model = AddOneModel() + + x_train = tf.constant([1, 2, 3, 4, 5], dtype=tf.float32) + y_train = tf.constant([2, 3, 4, 5, 6], dtype=tf.float32) + + model.compile(optimizer='sgd', loss='mse') + model.fit(x_train, y_train, epochs=1) + + path = tempdir + '/add_one_model' + model.save(path) + return path + + +class TensorflowToStableHLOTest(test.TestCase): + + def test_saved_model_to_stablehlo(self): + with tempfile.TemporaryDirectory() as tempdir: + path = build_savedmodel(tempdir) + module_bytecode = tensorflow_to_stablehlo.savedmodel_to_stablehlo( + input_path=path, input_arg_shapes_str='4' + ) + with ir.Context() as ctx: + stablehlo.register_dialect(ctx) + module = ir.Module.parse(module_bytecode) + self.assertIn('stablehlo.add %arg0, %cst : tensor<4xf32>', str(module)) + + def test_tf_mlir_to_stablehlo(self): + assembly = """ + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main(%arg0 : tensor) -> tensor { + %cst = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor + %0 = "tf.Add"(%arg0, %cst): (tensor, tensor) -> tensor + func.return %0 : tensor + } + } + """ + module_bytecode = tensorflow_to_stablehlo.tensorflow_module_to_stablehlo( + module=assembly, + input_arg_shapes_str='4', + ) + with ir.Context() as ctx: + stablehlo.register_dialect(ctx) + module = ir.Module.parse(module_bytecode) + self.assertIn('stablehlo.add %arg0, %cst : tensor<4xf32>', str(module)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo.cc b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo.cc new file mode 100644 index 00000000000000..1d1f775f5dfda1 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo.cc @@ -0,0 +1,97 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#include <vector>
+
+#include "pybind11/pybind11.h"  // from @pybind11
+#include "pybind11/pytypes.h"  // from @pybind11
+#include "pybind11_abseil/absl_casters.h"  // from @pybind11_abseil  // IWYU pragma: keep
+#include "pybind11_abseil/status_casters.h"  // from @pybind11_abseil  // IWYU pragma: keep
+#include "tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h"
+
+namespace py = pybind11;
+
+namespace {
+
+using mlir::tensorflow_to_stablehlo::pywrap::PywrapSavedModelToStablehlo;
+using mlir::tensorflow_to_stablehlo::pywrap::PywrapTfModuleToStablehlo;
+
+}  // namespace
+
+PYBIND11_MODULE(pywrap_tensorflow_to_stablehlo, m) {
+  m.doc() = "TensorFlow to StableHLO APIs.";
+
+  // LINT.IfChange(savedmodel_to_stablehlo)
+  m.def(
+      "savedmodel_to_stablehlo",
+      [](absl::string_view input_path,
+         const std::vector<std::string>& exported_model_signatures =
+             {"serving_default"},
+         const std::vector<std::string>& tag_names = {"serve"},
+         absl::string_view input_arg_shapes_str = "") -> py::bytes {
+        auto module_bytecode =
+            PywrapSavedModelToStablehlo(input_path, exported_model_signatures,
+                                        tag_names, input_arg_shapes_str);
+        if (!module_bytecode.ok()) {
+          PyErr_SetString(PyExc_ValueError,
+                          module_bytecode.status().ToString().c_str());
+          throw py::error_already_set();
+        }
+        return py::bytes(module_bytecode.value());
+      },
+      R"pbdoc(
+      Converts a TensorFlow SavedModel into StableHLO bytecode.
+
+      * input_path: The path to the input TensorFlow SavedModel.
+      * exported_model_signatures: List of exported model signatures to
+        convert.
+      * tag_names: List of tags for loading SavedModel.
+      * input_arg_shapes_str: A string representation of input argument shapes
+        for 'main' entry-point, separating tensors with ':', dimension with
+        ',', and using '?' for unknown sizes. For example,
+        'input-arg-shapes=1,2::1,?' expresses argument shapes [1,2], [] and
+        [1,?].
+      )pbdoc",
+      py::arg("input_path"),
+      py::arg("exported_model_signatures") =
+          std::vector<std::string>{"serving_default"},
+      py::arg("tag_names") = std::vector<std::string>{"serve"},
+      py::arg("input_arg_shapes_str") = "");
+  // LINT.ThenChange(pywrap_tensorflow_to_stablehlo.pyi:savedmodel_to_stablehlo)
+  //
+  // LINT.IfChange(tensorflow_module_to_stablehlo)
+  m.def(
+      "tensorflow_module_to_stablehlo",
+      [](absl::string_view module_op_str,
+         absl::string_view input_arg_shapes_str) -> py::bytes {
+        auto module_bytecode =
+            PywrapTfModuleToStablehlo(module_op_str, input_arg_shapes_str);
+        if (!module_bytecode.ok()) {
+          PyErr_SetString(PyExc_ValueError,
+                          module_bytecode.status().ToString().c_str());
+          throw py::error_already_set();
+        }
+        return py::bytes(module_bytecode.value());
+      },
+      R"pbdoc(
+      Converts a TensorFlow MLIR module string into StableHLO bytecode.
+
+      * module: TensorFlow MLIR module string.
+      * input_arg_shapes_str: A string representation of input argument shapes
+        for 'main' entry-point, separating tensors with ':', dimension with
+        ',', and using '?' for unknown sizes. For example,
+        'input-arg-shapes=1,2::1,?' expresses argument shapes [1,2], [] and
+        [1,?].
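+
+      Returns:
+        The StableHLO MLIR module serialized as bytecode (bytes).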
+ )pbdoc", + py::arg("module"), py::arg("input_arg_shapes_str") = ""); + // LINT.ThenChange(pywrap_tensorflow_to_stablehlo.pyi:tensorflow_module_to_stablehlo) +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo.pyi b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo.pyi new file mode 100644 index 00000000000000..ec5eaad7983bf0 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo.pyi @@ -0,0 +1,30 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# LINT.IfChange(savedmodel_to_stablehlo) +def savedmodel_to_stablehlo( + input_path: str, + exported_model_signatures: list[str] = ["serving_default"], + tag_names: list[str] = ["serve"], + input_arg_shapes_str: str = "", +) -> bytes: ... +# LINT.ThenChange() + +# LINT.IfChange(tensorflow_module_to_stablehlo) +def tensorflow_module_to_stablehlo( + module: str, + input_arg_shapes_str: str = "", +) -> bytes: ... +# LINT.ThenChange() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.cc b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.cc new file mode 100644 index 00000000000000..cbd535a861482f --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.cc @@ -0,0 +1,141 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h" + +#include +#include + +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Bytecode/BytecodeWriter.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/core/platform/path.h" + +namespace mlir::tensorflow_to_stablehlo::pywrap { + +absl::StatusOr ModuleToBytecode(ModuleOp module) { + std::string bytecode; + llvm::raw_string_ostream os(bytecode); + mlir::BytecodeWriterConfig config; + if (mlir::failed(mlir::writeBytecodeToFile(module, os, config))) { + return absl::InvalidArgumentError("mlir::writeBytecodeToFile failed"); + } + return bytecode; +} + +absl::StatusOr ExportModule(ModuleOp module) { + const std::string output_filename = tensorflow::io::GetTempFilename(".mlir"); + std::string error_msg; + auto output = openOutputFile(output_filename, &error_msg); + if (output == nullptr) { + return absl::UnknownError( + absl::StrCat("Unable to open output path: ", error_msg)); + } + + std::string result; + llvm::raw_string_ostream os(result); + OpPrintingFlags printing_flags; + module.print(os, printing_flags); + + output->os() << result; + output->keep(); + + return output_filename; +} + +absl::StatusOr PywrapSavedModelToStablehlo( + absl::string_view input_path, + const std::vector& exported_model_signatures, + const std::vector& tag_names, + absl::string_view input_arg_shapes_str) { + mlir::DialectRegistry registry; + RegisterAllTensorFlowDialects(registry); + mlir::MLIRContext context(registry); + context.loadAllAvailableDialects(); + + auto module = + TfToStablehlo(input_path, &context, exported_model_signatures, tag_names, + input_arg_shapes_str, /*is_input_mlir_module=*/false); + + if (!module.ok()) { + return absl::UnknownError( + absl::StrCat("Failed to convert SavedModel to StableHLO: ", + module.status().message())); + } + + auto bytecode = ModuleToBytecode(module.value().get()); + if (!bytecode.ok()) { + return absl::UnknownError( + absl::StrCat("Failed to serialize MLIR module to bytecode: ", + bytecode.status().message())); + } + + return bytecode.value(); +} + +absl::StatusOr PywrapTfModuleToStablehlo( + absl::string_view module_op_str, absl::string_view input_arg_shapes_str) { + mlir::DialectRegistry registry; + RegisterAllTensorFlowDialects(registry); + mlir::MLIRContext context(registry); + context.loadAllAvailableDialects(); + + auto tf_module = mlir::parseSourceString(module_op_str, &context); + if (!tf_module) { + return absl::UnknownError("Failed to parse MLIR module"); + } + + auto mlir_file_path = ExportModule(*tf_module); + if (!mlir_file_path.ok()) { + return absl::UnknownError( + absl::StrCat("Failed to write MLIR module to 
file: ",
+                     mlir_file_path.status().message()));
+  }
+
+  auto module = TfToStablehlo(*mlir_file_path, &context,
+                              /*exported_model_signatures=*/{},
+                              /*tag_names=*/{}, input_arg_shapes_str,
+                              /*is_input_mlir_module=*/true);
+
+  if (!module.ok()) {
+    return absl::UnknownError(
+        absl::StrCat("Failed to convert TF module to StableHLO: ",
+                     module.status().message()));
+  }
+
+  auto bytecode = ModuleToBytecode(module.value().get());
+  if (!bytecode.ok()) {
+    return absl::UnknownError(
+        absl::StrCat("Failed to serialize MLIR module to bytecode: ",
+                     bytecode.status().message()));
+  }
+
+  return bytecode.value();
+}
+
+}  // namespace mlir::tensorflow_to_stablehlo::pywrap
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h
new file mode 100644
index 00000000000000..c79ed32b990dd6
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/python/pywrap_tensorflow_to_stablehlo_lib.h
@@ -0,0 +1,67 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TO_STABLEHLO_PYTHON_PYWRAP_TENSORFLOW_TO_STABLEHLO_LIB_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TO_STABLEHLO_PYTHON_PYWRAP_TENSORFLOW_TO_STABLEHLO_LIB_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+
+namespace mlir::tensorflow_to_stablehlo::pywrap {
+
+// Converts a TensorFlow SavedModel to a StableHLO MLIR module and serializes
+// it to bytecode.
+//
+// Args:
+//  input_path: The path to the SavedModel directory.
+//  exported_model_signatures: List of exported model signatures to convert.
+//  tag_names: List of tags for loading the SavedModel.
+//  input_arg_shapes_str: A string representation of input argument shapes for
+//   'main' entry-point, separating tensors with ':', dimension with ',', and
+//   using '?' for unknown sizes. For example, 'input-arg-shapes=1,2::1,?'
+//   expresses argument shapes [1,2], [] and [1,?].
+//
+// Returns:
+//  An absl::StatusOr containing the serialized bytecode of the StableHLO
+//  module on success, or an error status on failure.
+absl::StatusOr<std::string> PywrapSavedModelToStablehlo(
+    absl::string_view input_path,
+    const std::vector<std::string>& exported_model_signatures,
+    const std::vector<std::string>& tag_names,
+    absl::string_view input_arg_shapes_str);
+
+// Converts a TensorFlow MLIR module string to a StableHLO MLIR module and
+// serializes it to bytecode.
+//
+// Args:
+//  module_op_str: TensorFlow MLIR module string.
+//  input_arg_shapes_str: A string representation of input argument shapes for
+//   'main' entry-point, separating tensors with ':', dimension with ',', and
+//   using '?' for unknown sizes. For example, 'input-arg-shapes=1,2::1,?'
expresses argument shapes [1,2], [] and [1,?]. +// +// Returns: +// An absl::StatusOr containing the serialized bytecode of the StableHLO +// module on success, or an error status on failure. +absl::StatusOr PywrapTfModuleToStablehlo( + absl::string_view module_op_str, absl::string_view input_arg_shapes_str); + +} // namespace mlir::tensorflow_to_stablehlo::pywrap + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TO_STABLEHLO_PYTHON_PYWRAP_TENSORFLOW_TO_STABLEHLO_LIB_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tests/test_tf_to_stablehlo.mlir b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tests/test_tf_to_stablehlo.mlir new file mode 100644 index 00000000000000..7c71e7014fa743 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tests/test_tf_to_stablehlo.mlir @@ -0,0 +1,22 @@ +// RUN: tf-to-stablehlo-translate %s --input-arg-shapes=1 -o - | FileCheck %s + +// CHECK-LABEL: func.func @main +// CHECK: %[[UQ:.*]] = stablehlo.uniform_quantize %arg0 : (tensor<1xf32>) -> tensor<1x!quant.uniform> +// CHECK: %[[BITCAST_CONVERT_0:.*]] = stablehlo.bitcast_convert %[[UQ]] : (tensor<1x!quant.uniform>) -> tensor<1xi8> +// CHECK: %[[BITCAST_CONVERT_1:.*]] = stablehlo.bitcast_convert %[[BITCAST_CONVERT_0]] : (tensor<1xi8>) -> tensor<1x!quant.uniform> +// CHECK: %[[UDQ:.*]] = stablehlo.uniform_dequantize %[[BITCAST_CONVERT_1]] : (tensor<1x!quant.uniform>) -> tensor<1xf32> +// CHECK: return %[[UDQ]] : tensor<1xf32> +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main(%arg0 : tensor) -> tensor { + %scales = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor + %zps = "tf.Const"() { value = dense<3> : tensor } : () -> tensor + + %0 = "tf.UniformQuantize"(%arg0, %scales, %zps) { + quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 + } : (tensor, tensor, tensor) -> tensor + %1 = "tf.UniformDequantize"(%0, %scales, %zps) { + quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64 + } : (tensor, tensor, tensor) -> tensor + func.return %1 : tensor + } +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.cc b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.cc new file mode 100644 index 00000000000000..08cf8e67957c28 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.cc @@ -0,0 +1,138 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h" +#include "tensorflow/core/public/session.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" + +namespace mlir { +namespace { + +// Extract the mlir TF module and optionally a ::tensorflow::SavedModelBundle +// from a saved model or from an mlir file. +absl::StatusOr ImportSavedModelOrTfMlir( + absl::string_view input_path, MLIRContext* context, + const std::vector& exported_model_signatures, + const std::vector& tag_names, bool is_input_mlir_module) { + if (is_input_mlir_module) { + std::string error_message; + std::unique_ptr file = + openInputFile(input_path, &error_message); + if (!file) { + return absl::AbortedError( + absl::StrCat("Failed to parse input MLIR model: ", error_message)); + } + + llvm::SourceMgr source_mgr; + source_mgr.AddNewSourceBuffer(std::move(file), llvm::SMLoc()); + auto module = parseSourceFile(source_mgr, context); + if (module->getOperation() == nullptr) { + return absl::AbortedError("Failed to parse input MLIR model."); + } + + return quant::stablehlo::ImportedMlirModuleOp(std::move(module), nullptr); + } + + std::unordered_set tag_set(tag_names.begin(), tag_names.end()); + return quant::stablehlo::SavedModelToMlirModuleOp( + input_path, tag_set, exported_model_signatures, *context); +} + +// Convert an TF module to a StableHLO module +absl::StatusOr> ConvertTFToStablehlo( + quant::stablehlo::ImportedMlirModuleOp imported_module, + absl::string_view input_path, MLIRContext* context, + const std::vector& tag_names, + absl::string_view input_arg_shapes_str, bool is_input_mlir_module) { + auto [module_op, saved_model_bundle] = std::move(imported_module); + + // Collect the names of the functions that have aliases so that they may not + // be inlined. 
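+  // These names are passed to PreprocessAndFreezeGraph() below as
+  // `noinline_functions`, since inlining them would invalidate the aliases.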
+ absl::flat_hash_set aliased_function_names; + if (!is_input_mlir_module) { + std::unordered_set tag_set(tag_names.begin(), tag_names.end()); + TF_ASSIGN_OR_RETURN( + auto function_aliases, + quant::stablehlo::GetFunctionAliases(input_path, tag_set)); + quant::stablehlo::UpdateFunctionAliases(function_aliases, *module_op); + absl::c_for_each(function_aliases, [&](const auto& aliases) { + return aliased_function_names.insert(aliases.first); + }); + } + + std::optional session; + if (saved_model_bundle) { + session = saved_model_bundle->GetSession(); + } + TF_ASSIGN_OR_RETURN(auto input_arg_shapes_vec, + TF::ParseArgumentShapes(input_arg_shapes_str)); + llvm::SmallVector> input_arg_shapes( + input_arg_shapes_vec.begin(), input_arg_shapes_vec.end()); + TF_RETURN_IF_ERROR(tensorflow::quantization::PreprocessAndFreezeGraph( + /*mlir_dump_file_prefix=*/"", /*is_inliner_run=*/true, + /*noinline_functions=*/aliased_function_names, *module_op, context, + session, + /*run_tf_to_stablehlo=*/true, /*deserialize_xla_call_module=*/false, + input_arg_shapes)); + + return std::move(module_op); +} + +} // namespace + +absl::StatusOr> TfToStablehlo( + absl::string_view input_path, MLIRContext* context, + const std::vector& exported_model_signatures, + const std::vector& tag_names, + absl::string_view input_arg_shapes_str, bool is_input_mlir_module) { + auto import_module_status = + ImportSavedModelOrTfMlir(input_path, context, exported_model_signatures, + tag_names, is_input_mlir_module); + if (!import_module_status.ok()) { + return import_module_status.status(); + } + + return ConvertTFToStablehlo(*std::move(import_module_status), input_path, + context, tag_names, input_arg_shapes_str, + is_input_mlir_module); +} + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.h b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.h new file mode 100644 index 00000000000000..55a579344d6c4d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.h @@ -0,0 +1,56 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TO_STABLEHLO_TF_TO_STABLEHLO_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TO_STABLEHLO_TF_TO_STABLEHLO_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project + +namespace mlir { + +// Converts a TensorFlow model (either from a SavedModel or an MLIR module) to a +// StableHLO MLIR module. +// +// Args: +// input_path: The path to the input TensorFlow SavedModel or MLIR module. +// context: The MLIR context to use for parsing or creating the MLIR module. +// exported_model_signatures: List of exported model signatures (strings) to +// convert. 
+// tag_names: List of tag names (strings) used for loading SavedModel. +// Ignored for MLIR input. +// input_arg_shapes_str: A string representation of input argument shapes for +// 'main' entry-point, separating tensors with ':', dimension with ',', and +// using '?' for unknown sizes. For example, 'input-arg-shapes=1,2::1,?' +// expresses argument shapes [1,2], [] and [1,?]. +// is_input_mlir_module: If true, `input_path` is treated as an MLIR +// module instead of a SavedModel. +// +// Returns: +// An absl::StatusOr containing the converted StableHLO MLIR module on +// success, or an absl::Status with an error message on failure. +absl::StatusOr> TfToStablehlo( + absl::string_view input_path, MLIRContext* context, + const std::vector& exported_model_signatures, + const std::vector& tag_names, + absl::string_view input_arg_shapes_str, bool is_input_mlir_module); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_TO_STABLEHLO_TF_TO_STABLEHLO_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo_translate.cc b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo_translate.cc new file mode 100644 index 00000000000000..6b43cc8b112313 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo_translate.cc @@ -0,0 +1,134 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow_to_stablehlo/tf_to_stablehlo.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" + +namespace { + +using llvm::cl::opt; + +// NOLINTNEXTLINE +opt input_path(llvm::cl::Positional, + llvm::cl::desc(""), llvm::cl::Required); + +// NOLINTNEXTLINE +opt output_filename("o", llvm::cl::desc(""), + llvm::cl::Optional, llvm::cl::init("-")); + +// NOLINTNEXTLINE +opt input_arg_shapes_str( + "input-arg-shapes", + llvm::cl::desc( + "A string representation of input argument shapes for 'main' " + "entry-point, separating tensors with ':', dimension with ',', and " + "using '?' for unknown sizes. For example, 'input-arg-shapes=1,2::1,?' 
" + "expresses argument shapes [1,2], [] and [1,?]"), + llvm::cl::Optional, llvm::cl::init("")); + +// NOLINTNEXTLINE +opt exported_model_signatures( + "exported-model-signatures", + llvm::cl::desc( + "Comma-separated list of exported model signatures to convert"), + llvm::cl::Optional, llvm::cl::init("serving_default")); + +// NOLINTNEXTLINE +opt tag_names( + "tags", + llvm::cl::desc("Comma-separated list of tags for loading SavedModel. " + "Ignored for MLIR input"), + llvm::cl::Optional, llvm::cl::init("serve")); + +// NOLINTNEXTLINE +opt elide_large_elements_attrs( + "e", + llvm::cl::desc( + "Elide large elements attrs while dumping the output StableHLO."), + llvm::cl::Optional, llvm::cl::init(false)); + +} // namespace + +namespace mlir { + +namespace { +// Dump the ModuleOp 'module' to the file specified using 'outputFileName' +absl::Status ExportModule(ModuleOp module) { + std::string error_msg; + auto output = openOutputFile(output_filename, &error_msg); + if (output == nullptr) { + return absl::AbortedError( + absl::StrCat("Unable to write to output path: ", error_msg)); + } + + // Export StableHLO MLIR as output + std::string result; + llvm::raw_string_ostream os(result); + OpPrintingFlags printing_flags; + if (elide_large_elements_attrs) { + printing_flags.elideLargeElementsAttrs(); + } + module.print(os, printing_flags); + os.flush(); + + output->os() << result; + output->keep(); + + return absl::OkStatus(); +} + +} // namespace +} // namespace mlir + +int main(int argc, char** argv) { + tensorflow::InitMlir y(&argc, &argv); + llvm::cl::ParseCommandLineOptions(argc, argv, + "TF Saved Model to Stablehlo converter\n"); + + mlir::DialectRegistry registry; + RegisterAllTensorFlowDialects(registry); + mlir::MLIRContext context(registry); + context.loadAllAvailableDialects(); + + bool is_input_mlir_module = absl::EndsWith(input_path, ".mlir"); + std::vector exported_model_signatures_in_vector = + absl::StrSplit(exported_model_signatures, ','); + std::vector tag_names_in_vector = absl::StrSplit(tag_names, ','); + auto module = TfToStablehlo( + input_path, &context, exported_model_signatures_in_vector, + tag_names_in_vector, input_arg_shapes_str, is_input_mlir_module); + if (!module.ok()) { + llvm::errs() << module.status().ToString() << "\n"; + return module.status().raw_code(); + } + + return mlir::ExportModule(module->get()).raw_code(); +} diff --git a/tensorflow/compiler/mlir/register_common_dialects.cc b/tensorflow/compiler/mlir/register_common_dialects.cc index b089bd9a1eb787..fe626375a8ee8f 100644 --- a/tensorflow/compiler/mlir/register_common_dialects.cc +++ b/tensorflow/compiler/mlir/register_common_dialects.cc @@ -28,7 +28,6 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "xla/mlir/framework/ir/xla_framework.h" #include "xla/mlir_hlo/mhlo/IR/register.h" -#include "xla/service/cpu/hlo_xla_runtime_pipeline.h" namespace mlir { @@ -38,7 +37,6 @@ void RegisterCommonToolingDialects(mlir::DialectRegistry& registry) { mlir::registerAllDialects(registry); mlir::registerAllExtensions(registry); mlir::stablehlo::registerAllDialects(registry); - xla::cpu::RegisterHloXlaRuntimePipelineDialects(registry); registry.insert(); registry.insert(); diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index abfcfcfc6746e6..d32bffa13dc11a 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -79,6 +79,7 @@ 'mlir-translate', 'odml-to-stablehlo-opt', 'odml_to_stablehlo', + 'odml-converter', 'stable-quant-opt', 'tac-opt-all-backends', 'tac-translate', diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index dc75547758e11f..b4fc94d8729d68 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -42,6 +42,7 @@ 'tensorflow/compiler/mlir/lite', 'tensorflow/compiler/mlir/lite/experimental/tac', 'tensorflow/compiler/mlir/lite/stablehlo', + 'tensorflow/compiler/mlir/lite/stablehlo/odml_converter', 'tensorflow/compiler/mlir/quantization/tensorflow', 'tensorflow/compiler/mlir/tensorflow', 'tensorflow/compiler/mlir/tfrt', diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 26d5e4d52b41d7..b138c2d3efd598 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -644,6 +644,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -675,10 +676,12 @@ cc_library( ":tensorflow", ":tensorflow_op_interfaces", ":tensorflow_side_effects", + ":tensorflow_traits", ":tensorflow_types", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:node_hash_map", + "@com_google_absl//absl/log", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:FuncDialect", @@ -778,6 +781,7 @@ cc_library( hdrs = ["utils/location_utils.h"], deps = [ "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -908,6 +912,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@local_tsl//tsl/platform:ml_dtypes", "@local_xla//xla:test", ], @@ -938,6 +943,7 @@ cc_library( "//tensorflow/core/util:managed_stack_trace", "@com_google_absl//absl/status", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@local_xla//xla/mlir/utils:error_util", ], ) @@ -1401,6 +1407,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -1473,6 +1480,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -1505,6 +1513,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -1516,6 +1525,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", + 
"@llvm-project//mlir:Support", ], ) @@ -1656,9 +1666,6 @@ aliased_targets = [ "export_graphdef", "import_model", "export_tf_dialect_op", - "translate_tf_dialect_op", - "mlir_roundtrip_pass", - "mlir_roundtrip_pass_registration", "mlir_roundtrip_flags", "mlir_import_options", "translate_lib", diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc index 348316e2648ccb..267bc48d17e06d 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc @@ -20,29 +20,36 @@ limitations under the License. #include #include +#include "absl/log/log.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "mlir/Analysis/CallGraph.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/ValueRange.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { namespace TF { @@ -74,7 +81,7 @@ class BacktrackAnalysisInfo { // the result cannot be backtracked to a region argument, returns // std::nullopt. std::optional GetArg(int result_index) const { - if (auto arg = GetValue(result_index).dyn_cast()) + if (auto arg = mlir::dyn_cast(GetValue(result_index))) if (arg.getParentBlock() == ®ion_->front()) return arg.getArgNumber(); return std::nullopt; } @@ -191,7 +198,7 @@ BacktrackAnalysis::BacktrackAnalysis( // possible. 
Value BacktrackAnalysis::BacktrackValue(Value value) { while (Operation* op = value.getDefiningOp()) { - int res_index = value.cast().getResultNumber(); + int res_index = mlir::cast(value).getResultNumber(); if (auto graph = dyn_cast(op)) { value = graph.GetFetch().getOperand(res_index); } else if (auto island = dyn_cast(op)) { diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h index 7afec29bc5df75..c49852c1864763 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h @@ -21,14 +21,21 @@ limitations under the License. #include #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc index 5ceda80490f688..e27d0405d7e8f1 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc @@ -46,7 +46,7 @@ ResourceConstructingOps ResourceConstructingOps::EntryState( return ResourceConstructingOps(); } ResourceConstructingOps ResourceConstructingOps::EntryState(Value value) { - if (auto barg = value.dyn_cast()) { + if (auto barg = mlir::dyn_cast(value)) { if (func::FuncOp func = dyn_cast(barg.getOwner()->getParentOp())) { SymbolTable symbol_table(func->getParentOfType()); @@ -87,7 +87,7 @@ IsComposite IsComposite::EntryState(MLIRContext *context) { IsComposite IsComposite::EntryState(Value value) { IsComposite result; - if (auto barg = value.dyn_cast()) { + if (auto barg = mlir::dyn_cast(value)) { if (func::FuncOp func = dyn_cast(barg.getOwner()->getParentOp())) { if (func.getArgAttr(barg.getArgNumber(), kCompositeDevice)) { diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h index 0cf3611af1d20c..1e68ac41d25b54 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h @@ -25,9 +25,11 @@ limitations under the License. 
#include "llvm/Support/Debug.h" #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" // from @llvm-project #include "mlir/Analysis/DataFlow/SparseAnalysis.h" // from @llvm-project +#include "mlir/Analysis/DataFlowFramework.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc index e1a984ea69bc67..372446641382ac 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.cc @@ -16,9 +16,18 @@ limitations under the License. #include +#include "llvm/ADT/STLExtras.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" @@ -29,8 +38,8 @@ namespace TF { namespace { bool IsResourceType(Type type) { - if (auto tensor_type = type.dyn_cast()) { - return tensor_type.getElementType().isa(); + if (auto tensor_type = mlir::dyn_cast(type)) { + return mlir::isa(tensor_type.getElementType()); } return false; } @@ -44,10 +53,9 @@ func::FuncOp GetSessionInitializerFunc(ModuleOp module) { auto session_init_op = tf_saved_model::GetSessionInitializerOp(module); if (session_init_op && !session_init_op.getInitializers().empty()) { SymbolTable symbol_table(module); - func::FuncOp init_func_op = - symbol_table.lookup(session_init_op.getInitializers()[0] - .cast() - .getValue()); + func::FuncOp init_func_op = symbol_table.lookup( + mlir::cast(session_init_op.getInitializers()[0]) + .getValue()); return init_func_op; } return nullptr; diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h index 9817b290c4cbdb..738d8c1df3d395 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h @@ -22,6 +22,11 @@ limitations under the License. 
#include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index c95dd020497385..179b3979348161 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -26,25 +26,32 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/container/node_hash_map.h" +#include "absl/log/log.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -73,7 +80,7 @@ const ResourceIdSet& UnknownResourceSet() { const ResourceIdSet& GetResourceUniqueIdsOrUnknown( Value value, const ResourceAliasAnalysis::Info& alias_analysis) { - if (!getElementTypeOrSelf(value.getType()).isa() || + if (!mlir::isa(getElementTypeOrSelf(value.getType())) || alias_analysis.IsUnknownResource(value)) return UnknownResourceSet(); return alias_analysis.GetResourceUniqueIds(value); } @@ -145,7 +152,7 @@ bool MayHaveSideEffect(Operation* op) { bool ShouldUseResourceAliasAnalysis( const MemoryEffects::EffectInstance& effect) { Value value = effect.getValue(); - if (value && getElementTypeOrSelf(value.getType()).isa()) { + if (value && mlir::isa(getElementTypeOrSelf(value.getType()))) { // For value-based effects on resource values we can use resource alias // analysis. 
return true; diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h index 97fcd30d36d02f..feb90de18857b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h @@ -23,12 +23,18 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Region.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/per_function_aggregate_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc index e9a35b1221c2a4..7275aee19e49f4 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -121,7 +121,7 @@ class MlirTensor : public TracingTensorHandle { Value getValue() { return value_; } Type getElementType() { - return value_.getType().cast().getElementType(); + return mlir::cast(value_.getType()).getElementType(); } // For LLVM style RTTI. @@ -340,11 +340,11 @@ Status MlirAbstractOp::SetOpName(const char* const op_name) { Status MlirAbstractOp::AddRef(Type type, Type* output_type) { Type elt_type = getElementTypeOrSelf(type); - if (elt_type.isa()) { + if (mlir::isa(elt_type)) { return InvalidArgument("Requested reference to a reference type"); } elt_type = TensorFlowRefType::get(elt_type); - if (RankedTensorType tensor_type = type.dyn_cast()) { + if (RankedTensorType tensor_type = mlir::dyn_cast(type)) { *output_type = RankedTensorType::get(tensor_type.getShape(), elt_type); } *output_type = UnrankedTensorType::get(elt_type); @@ -373,11 +373,11 @@ Status MlirAbstractOp::Create(ArrayRef operands, return InvalidArgument("Missing attribute '", output_arg.number_attr(), "' required for output list '", output_arg.name(), "'"); - if (!repeats_attr.isa()) + if (!mlir::isa(repeats_attr)) return InvalidArgument("Attribute '", output_arg.number_attr(), "' required for output list '", output_arg.name(), "' isn't an integer"); - int64_t repeats = repeats_attr.cast().getInt(); + int64_t repeats = mlir::cast(repeats_attr).getInt(); if (!output_arg.type_attr().empty()) { // Same type repeated "repeats" times. 
@@ -386,7 +386,7 @@ Status MlirAbstractOp::Create(ArrayRef operands, return InvalidArgument("Missing attribute '", output_arg.type_attr(), "' required for output '", output_arg.name(), "'"); - TypedAttr type_attr = attr.dyn_cast(); + TypedAttr type_attr = mlir::dyn_cast(attr); if (!type_attr) return InvalidArgument("Attribute '", output_arg.type_attr(), "' required for output '", output_arg.name(), @@ -410,7 +410,7 @@ Status MlirAbstractOp::Create(ArrayRef operands, return InvalidArgument("Missing attribute '", output_arg.type_attr(), "' required for output '", output_arg.name(), "'"); - TypeAttr type_attr = attr.dyn_cast(); + TypeAttr type_attr = mlir::dyn_cast(attr); if (!type_attr) return InvalidArgument("Attribute '", output_arg.type_attr(), "' required for output '", output_arg.name(), @@ -423,13 +423,13 @@ Status MlirAbstractOp::Create(ArrayRef operands, return InvalidArgument( "Missing attribute '", output_arg.type_list_attr(), "' required for output '", output_arg.name(), "'"); - ArrayAttr array_attr = attr.dyn_cast(); + ArrayAttr array_attr = mlir::dyn_cast(attr); if (!array_attr) return InvalidArgument("Attribute '", output_arg.type_list_attr(), "' required for output '", output_arg.name(), "' isn't an array attribute"); for (Attribute attr : array_attr) { - TypeAttr type_attr = attr.dyn_cast(); + TypeAttr type_attr = mlir::dyn_cast(attr); if (!type_attr) return InvalidArgument("Array Attribute '", output_arg.type_list_attr(), diff --git a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.cc index dba58f17ccb029..9a1db50ff6b732 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -45,11 +46,12 @@ _TfrtGetResourceOp::GetResourceHandleValueAndIdList( for (const auto &iter : llvm::enumerate(getResults())) { auto index = iter.index(); - if (getElementTypeOrSelf(iter.value().getType()).isa()) { + if (mlir::isa( + getElementTypeOrSelf(iter.value().getType()))) { resource_vec.push_back(GetResourceHandleValueAndIdBase( - getContainer()[index].cast().getValue(), - getSharedName()[index].cast().getValue(), device, - getResults()[index], resource_handle_id_map, next_id)); + mlir::cast(getContainer()[index]).getValue(), + mlir::cast(getSharedName()[index]).getValue(), + device, getResults()[index], resource_handle_id_map, next_id)); } } return resource_vec; @@ -100,16 +102,16 @@ mlir::LogicalResult IfrtCallOp::verify() { } for (mlir::Value arg : getArgs()) { - if (mlir::getElementTypeOrSelf(arg.getType()) - .isa()) { + if (mlir::isa( + mlir::getElementTypeOrSelf(arg.getType()))) { return emitOpError() << "does not support passing '!tf.resource' values as arguments"; } } for (mlir::Value result : getResults()) { - if (mlir::getElementTypeOrSelf(result.getType()) - .isa()) { + if (mlir::isa( + mlir::getElementTypeOrSelf(result.getType()))) { return emitOpError() << "does not support returning '!tf.resource' values as results"; } @@ -118,12 +120,13 @@ mlir::LogicalResult IfrtCallOp::verify() { // Verify variable_arg_indices is sorted in ascending order. int64_t prev_index = -1; for (auto arg_index_attr : getVariableArgIndicesAttr()) { - if (!arg_index_attr.isa_and_nonnull()) { + if (!mlir::isa_and_nonnull(arg_index_attr)) { return emitOpError() << "variable_arg_indices must be an integer"; } - int64_t index = - arg_index_attr.dyn_cast().getValue().getSExtValue(); + int64_t index = mlir::dyn_cast(arg_index_attr) + .getValue() + .getSExtValue(); if (index < 0) { return emitOpError() << "variable_arg_indices must be positive"; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td index 0c783c01caa287..e46a6500dfd516 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td @@ -94,29 +94,26 @@ Empty strings indicate that they are non-partitioned tensors.}]>:$shape_and_slic def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", [Pure]> { - let summary = "Loads a variable tensor as an IFRT array"; + let summary = "Loads a restored variable tensor as a tensor future"; let description = [{ - This op loads a variable tensor as an IFRT array and binds it with the specified name. + This op loads a restored variable tensor as a tensor future. It is a + replacement of `tf.ReadVariableOp`. - This op is an replacement of `tf.ReadVariableOp` in the case that a constant - variable tensor is an input to the tpu program invoked by `tf.IfrtCall`. + This op returns a scalar string tensor containing the restored variable name, which can be + used as a key within the runtime, as well as a future for the tensor. 
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td
index 0c783c01caa287..e46a6500dfd516 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.td
@@ -94,29 +94,26 @@ Empty strings indicate that they are non-partitioned tensors.}]>:$shape_and_slic

 def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", [Pure]> {
-  let summary = "Loads a variable tensor as an IFRT array";
+  let summary = "Loads a restored variable tensor as a tensor future";

   let description = [{
-    This op loads a variable tensor as an IFRT array and binds it with the specified name.
+    This op loads a restored variable tensor as a tensor future. It is a
+    replacement of `tf.ReadVariableOp`.

-    This op is an replacement of `tf.ReadVariableOp` in the case that a constant
-    variable tensor is an input to the tpu program invoked by `tf.IfrtCall`.
+    This op returns a scalar string tensor containing the restored variable
+    name, which can be used as a key within the runtime, and a future for the
+    tensor.

-    After a `tf.ReadVariableOp` is lowered into `tf.IfrtLoadVariableOp`, the `tf.IfrtCall` kernel
-    will bind the loaded IFRT array by name with the tpu program's input.
-
-    `tf.IfrtLoadVariableOp` converts the tensor into an IFRT array based on device and sharding
-    configuration specified in `VariableDeviceShardingConfigProto`.
-
-    This op returns a scalar string tensor containing the loaded variable name, which can be
-    used as a key to look for the loaded IFRT array in runtime and a restored tensor, which
-    maybe lowered to a future by runtime.
+    The `tf.IfrtCall` kernel uses the output $array_key.
+    Other ops executed by TFRT may make use of $tensor_future.
   }];

+  // TODO(b/339423851) Redefine the IfrtLoadVariableOp, as it doesn't require
+  // the sharding info in the attribute if multihost does not need this info.
   let arguments = (ins
     Arg:$variable,
     DefaultValuedStrAttr:$device_sharding_config_proto_text,
-    DefaultValuedAttr:$name
+    DefaultValuedAttr:$name,
+    DefaultValuedAttr:$used_by_host
   );

   let results = (outs
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc
index f5284a0ef3cf96..9a78a1a83ae214 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/Matchers.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"

 namespace mlir {
@@ -27,12 +28,12 @@ namespace TF {
 // Verifies an reduction op's `input` and reduction `dims`.
 LogicalResult VerifyReductionInputAndDims(Value input, Value dims,
                                           Location loc) {
-  auto dims_type = dims.getType().dyn_cast<RankedTensorType>();
+  auto dims_type = mlir::dyn_cast<RankedTensorType>(dims.getType());
   if (!dims_type) return success();
   if (dims_type.getRank() > 1)
     return emitError(loc, "dimensions can only be 0D or 1D tensor");

-  auto input_type = input.getType().dyn_cast<RankedTensorType>();
+  auto input_type = mlir::dyn_cast<RankedTensorType>(input.getType());
   if (!input_type) return success();

   int64_t rank = input_type.getRank();
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h
index aa0f84eb122e2b..64b5d2e141f13d 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "mlir/IR/OpDefinition.h"  // from @llvm-project
 #include "mlir/IR/TypeRange.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project

 namespace mlir {

@@ -60,10 +61,10 @@ template <
     typename OpT,
     typename std::enable_if<llvm::is_one_of<
         OpT, AddV2Op, SubOp, MulOp, DivOp, RealDivOp>::value>::type * = nullptr>
 OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op,
                                         ArrayRef<Attribute> operands) {
-  auto lhs_type = arithmetic_op.getX().getType().template cast<ShapedType>();
-  auto rhs_type = arithmetic_op.getY().getType().template cast<ShapedType>();
+  auto lhs_type = mlir::cast<ShapedType>(arithmetic_op.getX().getType());
+  auto rhs_type = mlir::cast<ShapedType>(arithmetic_op.getY().getType());
   auto result_type =
-      arithmetic_op.getResult().getType().template cast<ShapedType>();
+      mlir::cast<ShapedType>(arithmetic_op.getResult().getType());

   // We can fold arithmetic operation only of we can prove that we will not
   // accidentally hide a broadcasting error.
@@ -86,8 +87,8 @@ OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op,
   // Check that we have a constant operand on one side (candidate for identity).
   const bool is_commutative =
      (std::is_same<OpT, AddV2Op>::value || std::is_same<OpT, MulOp>::value);
-  auto lhs_attr = operands[0].dyn_cast_or_null<DenseElementsAttr>();
-  auto rhs_attr = operands[1].dyn_cast_or_null<DenseElementsAttr>();
+  auto lhs_attr = mlir::dyn_cast_or_null<DenseElementsAttr>(operands[0]);
+  auto rhs_attr = mlir::dyn_cast_or_null<DenseElementsAttr>(operands[1]);
   if (!rhs_attr && !(is_commutative && lhs_attr)) return {};

   // Mul and Div ops have identity value one while AddV2 and SubOp have identity
@@ -100,9 +101,9 @@ OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op,
   Type element_ty = lhs_type.getElementType();
   Attribute identity_attr;
-  if (auto ty = element_ty.template dyn_cast<FloatType>()) {
+  if (auto ty = mlir::dyn_cast<FloatType>(element_ty)) {
     identity_attr = FloatAttr::get(ty, static_cast<double>(identity));
-  } else if (auto ty = element_ty.template dyn_cast<IntegerType>()) {
+  } else if (auto ty = mlir::dyn_cast<IntegerType>(element_ty)) {
     identity_attr = IntegerAttr::get(ty, static_cast<int64_t>(identity));
   } else {
     return {};
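IdentityArithmeticOpFolder only erases x + 0 / x * 1 when operand and result types match exactly; otherwise a broadcast could silently change the result shape. A self-contained illustration of the hazard that type check guards against (plain C++; all names ours):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Simplified NumPy-style broadcast add of 1-D tensors where one side may be
// size 1.
std::vector<float> BroadcastAdd(const std::vector<float>& x,
                                const std::vector<float>& zeros) {
  size_t n = std::max(x.size(), zeros.size());
  std::vector<float> out(n);
  for (size_t i = 0; i < n; ++i)
    out[i] = x[x.size() == 1 ? 0 : i] + zeros[zeros.size() == 1 ? 0 : i];
  return out;
}

int main() {
  // x has one element, the zero constant has three: the sum has three
  // elements, so "fold to x" would be wrong even though only zeros were added.
  assert(BroadcastAdd({7.f}, {0.f, 0.f, 0.f}).size() == 3);
}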
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
index 5d145c85a68a06..df887ce453b8ea 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
@@ -99,7 +99,8 @@ struct TFInlinerInterface : public DialectInlinerInterface {
   Operation* materializeCallConversion(OpBuilder& builder, Value input,
                                        Type result_type,
                                        Location conversion_loc) const final {
-    if (!result_type.isa<TensorType>() || !input.getType().isa<TensorType>())
+    if (!mlir::isa<TensorType>(result_type) ||
+        !mlir::isa<TensorType>(input.getType()))
       return nullptr;
     return builder.create<TF::CastOp>(conversion_loc, result_type, input,
                                       /*truncate=*/builder.getBoolAttr(false));
@@ -307,7 +308,7 @@ ParseResult SetReplicateOpOperands(
     llvm::ArrayRef<Type> region_arg_types, int32_t* n) {
   for (const auto& attr : state->attributes)
     if (attr.getName().strref() == "n")
-      if (auto n_attr = attr.getValue().dyn_cast<IntegerAttr>())
+      if (auto n_attr = mlir::dyn_cast<IntegerAttr>(attr.getValue()))
         *n = n_attr.getInt();

   if (*n < 2)
@@ -507,13 +508,14 @@ LogicalResult ReplicateOp::verify() {
   // Check number of devices, if set, matches `n`.
   if (op.getDevices().has_value()) {
     for (auto device_attr : op.getDevices().value().getValue()) {
-      auto device_list = device_attr.getValue().dyn_cast_or_null<ArrayAttr>();
+      auto device_list =
+          mlir::dyn_cast_or_null<ArrayAttr>(device_attr.getValue());
       if (!device_list)
         return op.emitError()
                << "expects 'devices' to be a map alias and device name list.";

       bool is_device_string = llvm::all_of(device_list, [](Attribute attr) {
-        return attr.dyn_cast_or_null<StringAttr>();
+        return mlir::dyn_cast_or_null<StringAttr>(attr);
       });
       if (!is_device_string)
         return op.emitOpError() << "expects 'devices' to be a consists of "
@@ -747,8 +749,8 @@ static LogicalResult EliminatePassThroughResults(ClusterOp op,
       // Old bridge only removes unsupported TPU types (only string for now)
       // during outside compilation extraction so this should be enough for
      // the parity.
-      bool is_unsupported_type = getElementTypeOrSelf(operand.get().getType())
-                                     .isa<TF::StringType>();
+      bool is_unsupported_type = mlir::isa<TF::StringType>(
+          getElementTypeOrSelf(operand.get().getType()));
       Value result = operand.get();
       if (is_unsupported_type && result.getParentBlock() != &body &&
           !is_used_for_resource_write) {
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc
index f7c35420c22b4a..f48e1570933cf9 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/FoldUtils.h"  // from @llvm-project
 #include "mlir/Transforms/InliningUtils.h"  // from @llvm-project
@@ -119,11 +120,11 @@ Type TensorFlowExecutorDialect::parseType(DialectAsmParser &parser) const {

 void TensorFlowExecutorDialect::printType(Type type,
                                           DialectAsmPrinter &os) const {
-  if (type.isa<ControlType>()) {
+  if (mlir::isa<ControlType>(type)) {
     os << "control";
     return;
   }
-  if (type.isa<TokenType>()) {
+  if (mlir::isa<TokenType>(type)) {
     os << "token";
     return;
   }
@@ -141,7 +142,7 @@ namespace {
 LogicalResult VerifyControlOperandsAfterAllData(Operation *op) {
   bool found_control = false;
   for (int operand_idx : llvm::seq<int>(0, op->getNumOperands())) {
-    if (op->getOperand(operand_idx).getType().isa<ControlType>()) {
+    if (mlir::isa<ControlType>(op->getOperand(operand_idx).getType())) {
       found_control = true;
       continue;
     }
@@ -192,7 +193,7 @@ LogicalResult GraphOp::verify() {
     Value operand = fetch.getOperand(i);
     // Break out of the loop at the first control operand encountered.
     const int64_t num_results = graph.getNumResults();
-    if (operand.getType().isa<ControlType>()) {
+    if (mlir::isa<ControlType>(operand.getType())) {
       if (i != num_results)
         return fetch.emitOpError()
                << "operand #" << i
@@ -241,7 +242,7 @@ ParseResult GraphOp::parse(OpAsmParser &parser, OperationState &result) {
   // the fetch operation.
   result.types.reserve(fetch.getNumOperands());
   for (Type type : fetch.getOperandTypes()) {
-    if (type.isa<ControlType>()) break;
+    if (mlir::isa<ControlType>(type)) break;
     result.types.push_back(type);
   }
@@ -403,8 +404,8 @@ ParseResult SwitchOp::parse(OpAsmParser &parser, OperationState &result) {
   // fully qualified) or a short form with a single type (in which case the data
   // input and the outputs are all using this type and predicate is tensor
   // type).
-  if (types.front().isa<FunctionType>()) {
-    FunctionType type = types.front().cast<FunctionType>();
+  if (mlir::isa<FunctionType>(types.front())) {
+    FunctionType type = mlir::cast<FunctionType>(types.front());
     if (type.getNumInputs() < 2)
       return parser.emitError(parser.getNameLoc())
              << " expects a single data type and a predicate";
@@ -439,7 +440,7 @@ void SwitchOp::print(OpAsmPrinter &p) {
   p << " : ";
   if (getTrueOutput().getType() != data_operand_ty ||
       getFalseOutput().getType() != data_operand_ty ||
-      getPredicate().getType().isa<UnrankedTensorType>()) {
+      mlir::isa<UnrankedTensorType>(getPredicate().getType())) {
     p.printFunctionalType(getOperation());
   } else {
     p << getType(0);
@@ -465,16 +466,16 @@ LogicalResult SwitchNOp::verify() {
   // Check that operand can be broadcasted to each output type.
   auto operand0_type = switchn.getOperand(0).getType();
-  TensorType operand0_tensor_type = operand0_type.dyn_cast<TensorType>();
+  TensorType operand0_tensor_type = mlir::dyn_cast<TensorType>(operand0_type);
   if (!operand0_tensor_type) {
     return switchn.emitOpError()
            << "expects data operand to have tensor type but got "
            << operand0_type;
   }
   for (Type output_type : switchn.getResultTypes()) {
-    if (output_type.isa<ControlType>()) break;
+    if (mlir::isa<ControlType>(output_type)) break;

-    TensorType output_tensor_type = output_type.dyn_cast<TensorType>();
+    TensorType output_tensor_type = mlir::dyn_cast<TensorType>(output_type);
     if (!output_tensor_type) {
       return switchn.emitOpError()
              << "expects outputs to have tensor type but got " << output_type;
     }
@@ -483,10 +484,10 @@ LogicalResult SwitchNOp::verify() {
     // If the output type is a ref type, then the operand type should also be of
     // the same ref type. However, if the output type is a non-ref type T, then
     // the operand can be tensor of type T or T_REF.
-    bool is_output_ref =
-        output_tensor_type.getElementType().isa<tf_type::TensorFlowRefType>();
-    if (is_output_ref && !operand0_tensor_type.getElementType()
-                              .isa<tf_type::TensorFlowRefType>()) {
+    bool is_output_ref = mlir::isa<tf_type::TensorFlowRefType>(
+        output_tensor_type.getElementType());
+    if (is_output_ref && !mlir::isa<tf_type::TensorFlowRefType>(
+                             operand0_tensor_type.getElementType())) {
       return switchn.emitOpError()
              << "expects same operand and output element type but got "
              << operand0_tensor_type << " vs " << output_tensor_type;
@@ -573,24 +574,24 @@ LogicalResult MergeOp::verify() {
     return merge.emitOpError() << "expects at least one operand";

   Type data_type = merge.getOperand(0).getType();
-  if (data_type.isa<ControlType>())
+  if (mlir::isa<ControlType>(data_type))
     return merge.emitOpError() << "expects a non-control input";

   // Check that each operand can be individually broadcasted to the output type.
   Type output_type = merge.getOutput().getType();
-  TensorType output_tensor_ty = output_type.dyn_cast<TensorType>();
+  TensorType output_tensor_ty = mlir::dyn_cast<TensorType>(output_type);
   if (!output_tensor_ty) {
     return merge.emitOpError()
            << "expects output to have tensor type but got " << output_type;
   }
   bool is_output_ref =
-      output_tensor_ty.getElementType().isa<tf_type::TensorFlowRefType>();
+      mlir::isa<tf_type::TensorFlowRefType>(output_tensor_ty.getElementType());
   for (Type operand_type : merge.getOperandTypes()) {
-    if (operand_type.isa<ControlType>()) break;
+    if (mlir::isa<ControlType>(operand_type)) break;

     // TODO(hinsu): Update ControlOperandsAfterAllData trait to verify this
     // constraint.
-    TensorType operand_tensor_ty = operand_type.dyn_cast<TensorType>();
+    TensorType operand_tensor_ty = mlir::dyn_cast<TensorType>(operand_type);
     if (!operand_tensor_ty)
       return merge.emitOpError()
              << "expects data operands to have tensor type but got "
@@ -599,8 +600,8 @@ LogicalResult MergeOp::verify() {
   // If output type is a ref type then all operand types should also be of the
   // same ref type. However, if the output type is a non-ref type T, operands
   // can be tensor of type T or T_REF.
-  if (is_output_ref &&
-      !operand_tensor_ty.getElementType().isa<tf_type::TensorFlowRefType>()) {
+  if (is_output_ref && !mlir::isa<tf_type::TensorFlowRefType>(
+                           operand_tensor_ty.getElementType())) {
     return merge.emitOpError()
            << "expects same operand and output element type but got "
            << operand_tensor_ty << " vs " << output_tensor_ty;
@@ -624,7 +625,7 @@ void MergeOp::print(OpAsmPrinter &p) {
   Type output_type = getOutput().getType();
   for (Type operand_type : getOperandTypes()) {
-    if (operand_type.isa<ControlType>()) break;
+    if (mlir::isa<ControlType>(operand_type)) break;
     num_data_operands++;

     if (operand_type != output_type) {
@@ -660,7 +661,7 @@ ParseResult MergeOp::parse(OpAsmParser &parser, OperationState &result) {
   // Support parsing either a functional type (in which case all the types are
   // fully qualified) or a short form with a single type (in which case the data
   // inputs and the output are all using this type).
-  if (FunctionType type = types.front().dyn_cast<FunctionType>()) {
+  if (FunctionType type = mlir::dyn_cast<FunctionType>(types.front())) {
     result.types.assign(type.getResults().begin(), type.getResults().end());
     types.assign(type.getInputs().begin(), type.getInputs().end());
   } else {
@@ -747,7 +748,7 @@ ParseResult EnterOp::parse(OpAsmParser &parser, OperationState &result) {
   // Support parsing either a functional type (in which case all the types are
   // fully qualified) or a short form with a single type (in which case the data
   // input and the outputs are all using this type).
-  if (FunctionType type = types.front().dyn_cast<FunctionType>()) {
+  if (FunctionType type = mlir::dyn_cast<FunctionType>(types.front())) {
     // One data input, and any number of control inputs.
     if (type.getNumInputs() >= 1) {
       result.types.assign(type.getResults().begin(), type.getResults().end());
@@ -876,7 +877,7 @@ ParseResult LoopCondOp::parse(OpAsmParser &parser, OperationState &result) {
   // fully qualified) or a short form with a single type (in which case the data
   // input and the outputs are all using this type).
   Type control_type = ControlType::get(parser.getBuilder().getContext());
-  if (FunctionType type = types.front().dyn_cast<FunctionType>()) {
+  if (FunctionType type = mlir::dyn_cast<FunctionType>(types.front())) {
     if (llvm::count_if(type.getInputs(),
                        [=](Type type) { return type != control_type; }) != 1)
       return parser.emitError(parser.getNameLoc())
@@ -959,14 +960,14 @@ struct HoistInnerOpsSingleIslandGraph : public OpRewritePattern {
     llvm::SmallVector<Value, 8> new_rets;
     for (Value operand : fetch_op.getFetches()) {
       // Control results should not be propagated out.
-      if (operand.getType().isa<ControlType>()) break;
+      if (mlir::isa<ControlType>(operand.getType())) break;

       if (operand.getDefiningOp() != island_op) {
         // Operand is not from island, simply propagate it out.
         new_rets.push_back(operand);
       } else {
         // Lookup yield operand in island for inner op result.
-        auto result = operand.cast<OpResult>();
+        auto result = mlir::cast<OpResult>(operand);
         new_rets.push_back(yield_op.getOperand(result.getResultNumber()));
       }
     }
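Several tf_executor verifiers and parsers above hinge on one structural rule: data operands and results come first, and everything from the first control value onward is control-only. A minimal restatement of VerifyControlOperandsAfterAllData in plain C++ (the enum and names are ours):

#include <vector>

enum class Kind { kData, kControl };

// Once a control operand has been seen, no data operand may follow.
bool ControlOperandsTrailData(const std::vector<Kind>& operands) {
  bool seen_control = false;
  for (Kind k : operands) {
    if (k == Kind::kControl) {
      seen_control = true;
    } else if (seen_control) {
      return false;  // data after control: invalid
    }
  }
  return true;
}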
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
index 6ba660297366ba..b3d9200aa5d00d 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
@@ -6274,6 +6274,10 @@ Note that on CPU, if an out of bound index is found, an error is returned. On
 GPU, if an out of bound index is found, a 0 is stored in the corresponding
 output value.

+Note that on TPU, if any dimension of `params` is of size 0, the output will
+have the expected shape, filled with zeros. On CPU and GPU an error will be
+returned.
+
 See also `tf.batch_gather` and `tf.gather_nd`.
 }];
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc
index d3026b02878741..373586ae837a3f 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc
@@ -191,7 +191,8 @@ struct TFInlinerInterface : public DialectInlinerInterface {
   Operation *materializeCallConversion(OpBuilder &builder, Value input,
                                        Type result_type,
                                        Location conversion_loc) const final {
-    if (!result_type.isa<TensorType>() || !input.getType().isa<TensorType>())
+    if (!mlir::isa<TensorType>(result_type) ||
+        !mlir::isa<TensorType>(input.getType()))
       return nullptr;
     return builder.create<TF::CastOp>(conversion_loc, result_type, input,
                                       /*truncate=*/builder.getBoolAttr(false));
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
index a763b50ccd92cf..f8fcf569c9837a 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
@@ -2431,4 +2431,300 @@ def TF_StoreMinibatchStatisticsInFdoOp : TF_Op<"StoreMinibatchStatisticsInFdo",
   let results = (outs
   );
 }
+
+def TF_ConvertToListOfSparseCoreCooTensorsOp : TF_Op<"ConvertToListOfSparseCoreCooTensors", [Pure, SameVariadicOperandSize, SameVariadicResultSize]> {
+  let summary = "An op which converts a sparse/ragged/dense tensor into a list of COO tensors, one per SparseCore.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$indices_or_row_splits,
+    TF_Int32Tensor:$values,
+    TF_Float32Tensor:$weights,
+
+    ConfinedAttr]>:$sample_count,
+    ConfinedAttr]>:$row_offset,
+    ConfinedAttr]>:$col_offset,
+    ConfinedAttr]>:$col_shift,
+    ConfinedAttr]>:$num_sc_shards,
+    ConfinedAttr]>:$stacked_table_sample_count,
+    StrAttr:$combiner
+  );
+
+  let results = (outs
+    Variadic<TF_Int32Tensor>:$row_ids_list,
+    Variadic<TF_Int32Tensor>:$col_ids_list,
+    Variadic<TF_Float32Tensor>:$gains_list
+  );
+
+  TF_DerivedResultSizeAttr num_sc_per_chip = TF_DerivedResultSizeAttr<0>;
+}
+
+
+def TF_SortListOfSparseCoreCooTensorsOp : TF_Op<"SortListOfSparseCoreCooTensors", [Pure, SameVariadicOperandSize]> {
+  let summary = "An op which sorts each COO tensor in the list by the SparseCore its ids will go to. This op should be used along with the ConvertToSparseCoreCsrWrappedCooTensorOp.";
+
+  let arguments = (ins
+    Variadic<TF_Int32Tensor>:$row_ids_list,
+    Variadic<TF_Int32Tensor>:$col_ids_list,
+    Variadic<TF_Float32Tensor>:$gains_list,
+
+    I64ArrayAttr:$sample_count_list,
+    I64ArrayAttr:$col_offset_list,
+    ConfinedAttr]>:$num_replica,
+    ConfinedAttr]>:$table_vocab_size,
+    ConfinedAttr]>:$feature_width,
+    ConfinedAttr]>:$num_sc_per_chip,
+    ConfinedAttr]>:$max_ids_per_sparse_core,
+    ConfinedAttr]>:$max_unique_ids_per_sparse_core,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Int32Tensor:$sorted_row_ids,
+    TF_Int32Tensor:$sorted_col_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Int32Tensor:$id_counts
+  );
+
+  // N represents the number of COO tensors in the list.
+  TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<1>;
+}
+
+
+def TF_ConvertToSparseCoreCsrWrappedCooTensorOp : TF_Op<"ConvertToSparseCoreCsrWrappedCooTensorOp", [Pure, SameVariadicOperandSize]> {
+  let summary = "An op which converts the sorted COO tensors into the SparseCore CSR-wrapped COO format.";
+
+  let arguments = (ins
+    Variadic<TF_Int32Tensor>:$sorted_row_ids_list,
+    Variadic<TF_Int32Tensor>:$sorted_col_ids_list,
+    Variadic<TF_Float32Tensor>:$sorted_gains_list,
+    Variadic<TF_Int32Tensor>:$id_counts_list,
+    TF_Int64Tensor:$splits,
+
+    ConfinedAttr]>:$sample_count_per_sc,
+    ConfinedAttr]>:$num_replica,
+    ConfinedAttr]>:$max_minibatches_per_sc,
+    ConfinedAttr]>:$max_ids_per_chip_per_sample,
+    ConfinedAttr]>:$table_vocab_size,
+    ConfinedAttr]>:$feature_width,
+    StrAttr:$table_name,
+    BoolAttr:$allow_id_dropping
+  );
+
+  let results = (outs
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Int32Tensor:$row_pointers_unpadded_size,
+    TF_Int32Tensor:$ids_unpadded_size,
+    TF_Int32Tensor:$num_minibatches_per_sc
+  );
+
+  TF_DerivedOperandSizeAttr num_sc_per_chip = TF_DerivedOperandSizeAttr<1>;
+}
+
+
+def TF_GetStatsFromListOfSparseCoreCooTensorsOp : TF_Op<"GetStatsFromListOfSparseCoreCooTensors", [Pure, SameVariadicOperandSize]> {
+  let summary = "An op which computes the max_ids/max_uniques for a given table.";
+
+  let arguments = (ins
+    Variadic<TF_Int32Tensor>:$row_ids_list,
+    Variadic<TF_Int32Tensor>:$col_ids_list,
+    Variadic<TF_Float32Tensor>:$gains_list,
+
+    I64ArrayAttr:$sample_count_list,
+    I64ArrayAttr:$col_offset_list,
+    ConfinedAttr]>:$num_replica,
+    ConfinedAttr]>:$table_vocab_size,
+    ConfinedAttr]>:$feature_width,
+    ConfinedAttr]>:$num_sc_per_chip,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Int32Tensor:$max_ids_per_sparse_core,
+    TF_Int32Tensor:$max_unique_ids_per_sparse_core
+  );
+
+  TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<1>;
+}
+
+def TF_XlaSparseDenseMatmulWithStaticBufferSizeOp : TF_Op<"XlaSparseDenseMatmulWithStaticBufferSize", [Pure]> {
+  let summary = "An XLA op which performs the dense-sparse matrix multiplication.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$embedding_table,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    ConfinedAttr]>:$input_size,
+    OptionalAttr:$quantization_config_low,
+    OptionalAttr:$quantization_config_high,
+    OptionalAttr:$quantization_config_num_buckets,
+    ConfinedAttr]>:$max_ids_per_sparse_core,
+    ConfinedAttr]>:$max_unique_ids_per_sparse_core,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$activations
+  );
+}
+
+
+def TF_XlaSparseDenseMatmulGradWithSgdAndStaticBufferSizeOp : TF_Op<"XlaSparseDenseMatmulGradWithSgdAndStaticBufferSize", [Pure]> {
+  let summary = "An XLA op which performs the SGD optimizer update for the dense-sparse matrix multiplication.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    F32Attr:$clip_weight_min,
+    F32Attr:$clip_weight_max,
+    ConfinedAttr]>:$max_ids_per_sparse_core,
+    ConfinedAttr]>:$max_unique_ids_per_sparse_core,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table
+  );
+}
+
+def TF_XlaSparseDenseMatmulGradWithAdagradAndStaticBufferSizeOp : TF_Op<"XlaSparseDenseMatmulGradWithAdagradAndStaticBufferSize", [Pure]> {
+  let summary = "An XLA op which performs the Adagrad optimizer update for the dense-sparse matrix multiplication.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$accumulator,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    F32Attr:$clip_weight_min,
+    F32Attr:$clip_weight_max,
+    ConfinedAttr]>:$max_ids_per_sparse_core,
+    ConfinedAttr]>:$max_unique_ids_per_sparse_core,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator
+  );
+}
+
+def TF_XlaSparseDenseMatmulGradWithAdagradMomentumAndStaticBufferSizeOp : TF_Op<"XlaSparseDenseMatmulGradWithAdagradMomentumAndStaticBufferSize", [Pure]> {
+  let summary = "An XLA op which performs the Adagrad momentum optimizer update for the dense-sparse matrix multiplication.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$accumulator,
+    TF_Float32Tensor:$momenta,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    BoolAttr:$use_nesterov,
+    F32Attr:$exponent,
+    F32Attr:$beta1,
+    F32Attr:$beta2,
+    F32Attr:$epsilon,
+    F32Attr:$clip_weight_min,
+    F32Attr:$clip_weight_max,
+    ConfinedAttr]>:$max_ids_per_sparse_core,
+    ConfinedAttr]>:$max_unique_ids_per_sparse_core,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator,
+    TF_Float32Tensor:$updated_momenta
+  );
+}
+
+def TF_XlaSparseDenseMatmulGradWithAdamAndStaticBufferSizeOp : TF_Op<"XlaSparseDenseMatmulGradWithAdamAndStaticBufferSize", [Pure]> {
+  let summary = "An XLA op which performs the Adam optimizer update for the dense-sparse matrix multiplication.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$momenta,
+    TF_Float32Tensor:$velocity,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    BoolAttr:$use_sum_inside_sqrt,
+    F32Attr:$beta1,
+    F32Attr:$beta2,
+    F32Attr:$epsilon,
+    F32Attr:$clip_weight_min,
+    F32Attr:$clip_weight_max,
+    ConfinedAttr]>:$max_ids_per_sparse_core,
+    ConfinedAttr]>:$max_unique_ids_per_sparse_core,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_momenta,
+    TF_Float32Tensor:$updated_velocity
+  );
+}
+
+def TF_XlaSparseDenseMatmulGradWithFtrlAndStaticBufferSizeOp : TF_Op<"XlaSparseDenseMatmulGradWithFtrlAndStaticBufferSize", [Pure]> {
+  let summary = "An XLA op which performs the Ftrl optimizer update for the dense-sparse matrix multiplication.";
+
+  let arguments = (ins
+    TF_Int32Tensor:$row_pointers,
+    TF_Int32Tensor:$sorted_sample_ids,
+    TF_Int32Tensor:$sorted_token_ids,
+    TF_Float32Tensor:$sorted_gains,
+    TF_Float32Tensor:$activation_gradients,
+    TF_Float32Tensor:$learning_rate,
+    TF_Float32Tensor:$embedding_table,
+    TF_Float32Tensor:$accumulator,
+    TF_Float32Tensor:$linear,
+    TF_Int32Tensor:$num_minibatches_per_physical_sparse_core,
+
+    BoolAttr:$multiply_linear_by_learning_rate,
+    F32Attr:$beta,
+    F32Attr:$learning_rate_power,
+    F32Attr:$l1_regularization_strength,
+    F32Attr:$l2_regularization_strength,
+    F32Attr:$clip_weight_min,
+    F32Attr:$clip_weight_max,
+    ConfinedAttr]>:$max_ids_per_sparse_core,
+    ConfinedAttr]>:$max_unique_ids_per_sparse_core,
+    StrAttr:$table_name
+  );
+
+  let results = (outs
+    TF_Float32Tensor:$updated_embedding_table,
+    TF_Float32Tensor:$updated_accumulator,
+    TF_Float32Tensor:$updated_linear
+  );
+}
 #endif // TF_OPS
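The new ops above form one pipeline: embedding inputs are converted into per-SparseCore COO lists, sorted and measured, CSR-wrapped, and finally consumed by the XlaSparseDenseMatmul* kernels. Purely as orientation, here is a toy sketch of the bucketing step; the real routing depends on the col_shift/col_offset/num_sc_shards attributes and is not specified in this patch, so the modulo rule below is an assumption for illustration only (all names ours):

#include <cstddef>
#include <cstdint>
#include <vector>

struct CooLists {
  std::vector<std::vector<int32_t>> row_ids, col_ids;
  std::vector<std::vector<float>> gains;
};

// Toy illustration: bucket COO entries per SparseCore, assuming ids are
// routed round-robin by column id (a simplification of the real scheme).
CooLists BucketBySparseCore(const std::vector<int32_t>& rows,
                            const std::vector<int32_t>& cols,
                            const std::vector<float>& gains, int num_sc) {
  CooLists out;
  out.row_ids.resize(num_sc);
  out.col_ids.resize(num_sc);
  out.gains.resize(num_sc);
  for (size_t i = 0; i < cols.size(); ++i) {
    int sc = cols[i] % num_sc;  // assumed routing function, for illustration
    out.row_ids[sc].push_back(rows[i]);
    out.col_ids[sc].push_back(cols[i]);
    out.gains[sc].push_back(gains[i]);
  }
  return out;
}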
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
index 988c749adb8cc6..36fb36a3d451c6 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
@@ -160,12 +160,12 @@ OpFoldResult AddNOp::fold(FoldAdaptor adaptor) {
   int non_zero_index = -1;
   auto IsKnownZero = [](Attribute attr) {
     if (!attr) return false;
-    auto splat = attr.dyn_cast<SplatElementsAttr>();
+    auto splat = mlir::dyn_cast<SplatElementsAttr>(attr);
     if (!splat) return false;
     Type element_ty = splat.getType().getElementType();
-    if (element_ty.isa<FloatType>())
-      return splat.getSplatValue<APFloat>().isZero();
-    if (element_ty.isa<IntegerType>())
+    if (mlir::isa<FloatType>(element_ty))
+      return splat.getSplatValue<APFloat>().isZero();
+    if (mlir::isa<IntegerType>(element_ty))
       return splat.getSplatValue<APInt>().getSExtValue() == 0;
     return false;
   };
@@ -180,13 +180,13 @@ OpFoldResult AddNOp::fold(FoldAdaptor adaptor) {
   }

   // Only fold when the result shape is fully static.
-  auto result_ty = getType().dyn_cast<ShapedType>();
+  auto result_ty = mlir::dyn_cast<ShapedType>(getType());
   if (!result_ty || !result_ty.hasStaticShape()) return {};

   if (non_zero_index == -1) {
     return SplatElementsAttr::get(
-        result_ty,
-        operands.begin()->cast<SplatElementsAttr>().getSplatValue<Attribute>());
+        result_ty, mlir::cast<SplatElementsAttr>(*operands.begin())
+                       .getSplatValue<Attribute>());
   }

   // Check the non-zero operand's shape matches the result shape.
@@ -423,7 +423,7 @@ LogicalResult BatchToSpaceOp::verify() {
   int64_t block_size = op.getBlockSize();

   llvm::SmallVector<int64_t, 4> input_shape(4, ShapedType::kDynamic);
-  auto input_type = op.getInput().getType().cast<ShapedType>();
+  auto input_type = mlir::cast<ShapedType>(op.getInput().getType());
   if (input_type.hasRank()) {
     if (input_type.getRank() != 4)
       return op.emitOpError()
@@ -442,7 +442,7 @@ LogicalResult BatchToSpaceOp::verify() {
                        input_type.getShape().end());
   }

-  auto crops_type = op.getCrops().getType().cast<ShapedType>();
+  auto crops_type = mlir::cast<ShapedType>(op.getCrops().getType());
   if (crops_type.hasRank()) {
     if (crops_type.getRank() != 2)
       return op.emitOpError()
@@ -477,7 +477,7 @@ LogicalResult BatchToSpaceOp::verify() {
     }
   }

-  auto output_type = op.getOutput().getType().cast<ShapedType>();
+  auto output_type = mlir::cast<ShapedType>(op.getOutput().getType());
   if (output_type.hasRank()) {
     if (output_type.getRank() != 4)
       return op.emitOpError()
@@ -567,8 +567,8 @@ void BatchToSpaceOp::getCanonicalizationPatterns(RewritePatternSet& results,

 LogicalResult BatchToSpaceNDOp::verify() {
   BatchToSpaceNDOp op = *this;
-  auto block_shape_ty = op.getBlockShape().getType().cast<ShapedType>();
-  auto crops_ty = op.getCrops().getType().cast<ShapedType>();
+  auto block_shape_ty = mlir::cast<ShapedType>(op.getBlockShape().getType());
+  auto crops_ty = mlir::cast<ShapedType>(op.getCrops().getType());

   if (block_shape_ty.hasStaticShape() && crops_ty.hasStaticShape()) {
     const int block_rank = block_shape_ty.getShape().front();
@@ -617,9 +617,9 @@ LogicalResult BiasAddOp::verify() {
     return op.emitOpError("requires bias operand to have rank exactly one");

   RankedTensorType value_ty =
-      op.getValue().getType().dyn_cast<RankedTensorType>();
+      mlir::dyn_cast<RankedTensorType>(op.getValue().getType());
   RankedTensorType bias_ty =
-      op.getBias().getType().dyn_cast<RankedTensorType>();
+      mlir::dyn_cast<RankedTensorType>(op.getBias().getType());
   if (!bias_ty || !value_ty) return success();

   int64_t feature_dim_idx =
@@ -716,7 +716,7 @@ OpFoldResult BroadcastToOp::fold(FoldAdaptor) {

   // Fold broadcast if operand and result types are the same and all dimensions
   // are statically known (no-op broadcast).
-  auto result_ty = getType().dyn_cast<ShapedType>();
+  auto result_ty = mlir::dyn_cast<ShapedType>(getType());
   if (!result_ty || !result_ty.hasStaticShape()) return {};
   if (result_ty == input.getType()) return input;
@@ -818,8 +818,8 @@ LogicalResult BroadcastGradientArgsOp::verify() {

   // Verify that output types are of rank one and matches the computed result
   // shape.
-  auto r0_ty = op.getR0().getType().dyn_cast<RankedTensorType>();
-  auto r1_ty = op.getR1().getType().dyn_cast<RankedTensorType>();
+  auto r0_ty = mlir::dyn_cast<RankedTensorType>(op.getR0().getType());
+  auto r1_ty = mlir::dyn_cast<RankedTensorType>(op.getR1().getType());
   if (r0_ty && r0_ty.hasStaticShape() && r0_ty.getDimSize(0) != r0.size())
     return op.emitOpError() << "requires dimension 0 size of 'r0' to be "
                             << r0.size() << " but got " << r0_ty.getShape()[0];
@@ -852,7 +852,8 @@ LogicalResult BroadcastGradientArgsOp::fold(

   auto build_out_dense_element = [](SmallVectorImpl<int64_t>& shape,
                                     Type input_type) {
-    Type element_type = input_type.cast<RankedTensorType>().getElementType();
+    Type element_type =
+        mlir::cast<RankedTensorType>(input_type).getElementType();
     RankedTensorType type = tensorflow::GetTypeFromTFTensorShape(
         {static_cast<int64_t>(shape.size())}, element_type);
     // Input could only be i32 or i64. For i32, downcast to int32_t array.
@@ -893,7 +894,7 @@ LogicalResult FoldConstantCaseOp::matchAndRewrite(
   int index = *branch.getValues<int>().begin();
   if (index < 0 || index >= op.num_branches()) index = op.num_branches() - 1;

-  auto func = op.getBranches()[index].cast<SymbolRefAttr>();
+  auto func = mlir::cast<SymbolRefAttr>(op.getBranches()[index]);
   auto empty = rewriter.getStringAttr("");
   ReplaceTfOpWithNewOp<PartitionedCallOp>(
       rewriter, op, op.getResultTypes(), op.getOperands().drop_front(), func,
@@ -932,7 +933,7 @@ static LogicalResult VerifyCaseOrIfOpBranchFunctions(
   for (const auto& branch : llvm::enumerate(branches)) {
     auto branch_func = symbol_table.lookupNearestSymbolFrom<func::FuncOp>(
-        op, branch.value().cast<SymbolRefAttr>());
+        op, mlir::cast<SymbolRefAttr>(branch.value()));
     if (!branch_func)
       return op->emitOpError()
              << "expects " << branch_name(branch.index()) << " ("
@@ -1347,12 +1348,10 @@ LogicalResult HoistCwiseBinaryOutOfConcat::matchAndRewrite(
   else
     return failure();
   DenseElementsAttr const_attr;
-  auto scalar_tensor_type =
-      first_arg_op->getOperand(hoist_params->scalar_operand_idx)
-          .getType()
-          .dyn_cast<RankedTensorType>();
+  auto scalar_tensor_type = mlir::dyn_cast<RankedTensorType>(
+      first_arg_op->getOperand(hoist_params->scalar_operand_idx).getType());
   Type scalar_dtype = scalar_tensor_type.getElementType();
-  if (scalar_dtype.isa<FloatType>())
+  if (mlir::isa<FloatType>(scalar_dtype))
     const_attr = DenseElementsAttr::get(scalar_tensor_type,
                                         static_cast<float>(identity_val));
   else
@@ -1450,7 +1449,7 @@ HoistCwiseBinaryOutOfConcat::GetHoistParams(
     } else {
       operand = arg.getDefiningOp()->getOperand(operand_idx);
     }
-    auto ranked = operand.getType().dyn_cast<RankedTensorType>();
+    auto ranked = mlir::dyn_cast<RankedTensorType>(operand.getType());
     return ranked && ranked.getRank() == (axis + 1) &&
            ranked.getShape()[axis] == 1;
   });
@@ -1461,13 +1460,13 @@ HoistCwiseBinaryOutOfConcat::GetHoistParams(
     return llvm::all_of(op.getValues(), [&](Value arg) -> bool {
       if (exceptions.count(arg)) return true;
      auto operand = arg.getDefiningOp()->getOperand(operand_idx);
-      auto ranked = operand.getType().dyn_cast<ShapedType>();
+      auto ranked = mlir::dyn_cast<ShapedType>(operand.getType());
       return ranked && ranked.hasRank() && ranked.getRank() == 0;
     });
   };

   // Concat result type must be a ranked tensor.
-  auto ranked = op.getType().dyn_cast<RankedTensorType>();
+  auto ranked = mlir::dyn_cast<RankedTensorType>(op.getType());
   if (!ranked) return std::nullopt;

   // TODO(ezhulenev): Add support for more valid concat patterns.
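HoistCwiseBinaryOutOfConcat is justified by the fact that elementwise ops commute with concatenation. A quick numeric check of that identity for 1-D tensors (plain C++; names ours):

#include <cassert>
#include <cstddef>
#include <vector>

// Axis-0 concatenation of two 1-D tensors.
std::vector<float> Concat(const std::vector<float>& a,
                          const std::vector<float>& b) {
  std::vector<float> out(a);
  out.insert(out.end(), b.begin(), b.end());
  return out;
}

// Elementwise multiply of equally sized 1-D tensors.
std::vector<float> Mul(const std::vector<float>& a,
                       const std::vector<float>& b) {
  std::vector<float> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) out[i] = a[i] * b[i];
  return out;
}

int main() {
  std::vector<float> a{1, 2}, b{3, 4}, c1{10, 10}, c2{20, 20};
  // Hoisting the multiply out of the concat preserves the result, which is
  // what lets the rewrite replace N multiplies with a single one.
  assert(Concat(Mul(a, c1), Mul(b, c2)) == Mul(Concat(a, b), Concat(c1, c2)));
}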
@@ -1527,7 +1526,7 @@ static LogicalResult Verify(OpT op) {
   DenseIntElementsAttr axis_attr;
   if (matchPattern(op.getAxis(), m_Constant(&axis_attr))) {
-    auto input_ty = op.getX().getType().template dyn_cast<RankedTensorType>();
+    auto input_ty = mlir::dyn_cast<RankedTensorType>(op.getX().getType());
     if (input_ty) {
       int64_t rank = input_ty.getRank();
       assert(axis_attr.getNumElements() == 1 &&
@@ -1561,7 +1560,8 @@ LogicalResult ConcatOffsetOp::verify() {
           << "requires sizes of shapes and offsets to be the same, got sizes "
           << op.getShape().size() << " and " << op.getOffset().size();

-  auto ranked_dim = op.getConcatDim().getType().dyn_cast<RankedTensorType>();
+  auto ranked_dim =
+      mlir::dyn_cast<RankedTensorType>(op.getConcatDim().getType());
   if (ranked_dim && ranked_dim.getRank() != 0)
     return op.emitOpError()
            << "requires concat_dim to be a scalar, got tensor of rank "
@@ -1578,7 +1578,7 @@ LogicalResult ConcatOffsetOp::verify() {
       return op.emitOpError()
              << "requires operand and result " << idx
              << " to have compatible shapes";

-    auto ranked_shape = shape.getType().dyn_cast<RankedTensorType>();
+    auto ranked_shape = mlir::dyn_cast<RankedTensorType>(shape.getType());
     if (!ranked_shape) continue;

     if (ranked_shape.getRank() != 1)
@@ -1609,14 +1609,15 @@ LogicalResult ConcatOffsetOp::fold(FoldAdaptor adaptor,
   if (operands.size() < 3) return failure();

   // Check concat_dim is a scalar.
-  auto concat_dim_attr = operands[0].dyn_cast_or_null<DenseIntElementsAttr>();
+  auto concat_dim_attr =
+      mlir::dyn_cast_or_null<DenseIntElementsAttr>(operands[0]);
   if (!concat_dim_attr || concat_dim_attr.getType().getRank() != 0)
     return failure();

   llvm::SmallVector<DenseIntElementsAttr, 4> shapes;
   shapes.reserve(operands.size() - 1);
   for (Attribute shape : llvm::drop_begin(operands, 1))
-    if (auto shape_attr = shape.dyn_cast_or_null<DenseIntElementsAttr>())
+    if (auto shape_attr = mlir::dyn_cast_or_null<DenseIntElementsAttr>(shape))
       shapes.push_back(shape_attr);
     else
       return failure();
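ConcatOffsetOp's folder computes, for each input shape, where that input starts in the concatenated result: zero in every dimension except concat_dim, which receives the running sum of the preceding sizes along that dimension. A standalone sketch of that arithmetic (names ours):

#include <cstdint>
#include <vector>

// For inputs whose shapes agree everywhere except `concat_dim`, the i-th
// offset vector is all zeros with the running sum of preceding sizes at
// `concat_dim`.
std::vector<std::vector<int32_t>> ConcatOffsets(
    const std::vector<std::vector<int32_t>>& shapes, int concat_dim) {
  std::vector<std::vector<int32_t>> offsets;
  int32_t running = 0;
  for (const auto& shape : shapes) {
    std::vector<int32_t> off(shape.size(), 0);
    off[concat_dim] = running;
    running += shape[concat_dim];
    offsets.push_back(off);
  }
  return offsets;
}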
@@ -1685,14 +1686,14 @@ OpFoldResult ConstOp::fold(FoldAdaptor adaptor) {

 void ConstOp::build(OpBuilder& builder, OperationState& result,
                     Attribute value) {
   ShapedType type;
-  if (auto elem_attr = value.dyn_cast<ElementsAttr>()) {
+  if (auto elem_attr = mlir::dyn_cast<ElementsAttr>(value)) {
     return ConstOp::build(builder, result, elem_attr);
-  } else if (value.isa<BoolAttr, FloatAttr, IntegerAttr, StringAttr>()) {
+  } else if (mlir::isa<BoolAttr, FloatAttr, IntegerAttr, StringAttr>(value)) {
     // All TensorFlow types must be tensor types. In the build() method,
     // we want to provide more flexibility by allowing attributes of scalar
     // types. But we need to wrap it up with ElementsAttr to construct
     // valid TensorFlow constants.
-    auto typed_attr = value.cast<TypedAttr>();
+    auto typed_attr = mlir::cast<TypedAttr>(value);
     type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{},
                                                 typed_attr.getType());
     return ConstOp::build(builder, result, DenseElementsAttr::get(type, value));
@@ -1704,7 +1705,7 @@ void ConstOp::build(OpBuilder& builder, OperationState& result, Type type,
                     Attribute value) {
   // Handle the case where the type and value are already tensors.
-  if (type.isa<TensorType>() && value.isa<ElementsAttr>()) {
+  if (mlir::isa<TensorType>(type) && mlir::isa<ElementsAttr>(value)) {
     result.addTypes(type);
     result.addAttribute("value", value);
     return;
@@ -1722,7 +1723,7 @@ LogicalResult ConstOp::inferReturnTypes(
   ConstOpAdaptor adaptor(operands, attributes, properties, regions);
   auto value = adaptor.getValue();
   if (!value) return emitOptionalError(location, "missing attribute 'value'");
-  if (auto elem_attr = value.dyn_cast<ElementsAttr>()) {
+  if (auto elem_attr = mlir::dyn_cast<ElementsAttr>(value)) {
     inferredReturnTypes.assign({elem_attr.getType()});
     return success();
   }
@@ -1743,7 +1744,7 @@ static LogicalResult VerifyConvOpAttributes(
     return emitOptionalError(
         location, "requires strides attribute length to be ", num_dims);
   auto is_not_positive = [](Attribute val) {
-    return val.cast<IntegerAttr>().getValue().getSExtValue() <= 0;
+    return mlir::cast<IntegerAttr>(val).getValue().getSExtValue() <= 0;
   };
   if (llvm::any_of(strides, is_not_positive))
     return emitOptionalError(location, "requires positive strides");
@@ -1793,9 +1794,8 @@ static LogicalResult Verify(OpT op) {

   if (padding == tensorflow::Padding::EXPLICIT) {
     ArrayRef<int64_t> explicit_padding;
-    ArrayAttr explicit_pad =
-        op->getAttr("explicit_paddings")
-            .template dyn_cast_or_null<::mlir::ArrayAttr>();
+    ArrayAttr explicit_pad = mlir::dyn_cast_or_null<::mlir::ArrayAttr>(
+        op->getAttr("explicit_paddings"));
     if (!explicit_pad) {
       explicit_pad = ::mlir::Builder(op->getContext()).getI64ArrayAttr({});
     }
@@ -1812,7 +1812,7 @@ static LogicalResult Verify(OpT op) {
                                num_dims * 2);
     }
     auto is_negative = [](Attribute val) {
-      return val.cast<IntegerAttr>().getValue().getSExtValue() < 0;
+      return mlir::cast<IntegerAttr>(val).getValue().getSExtValue() < 0;
     };
     if (llvm::any_of(explicit_padding, is_negative))
       return emitOptionalError(op.getLoc(),
@@ -1827,7 +1827,7 @@ static LogicalResult Verify(OpT op) {
   }

   int64_t input_channels = ShapedType::kDynamic;
-  if (auto ty = op.getInput().getType().template dyn_cast<RankedTensorType>()) {
+  if (auto ty = mlir::dyn_cast<RankedTensorType>(op.getInput().getType())) {
     absl::string_view data_format(op.getDataFormat().data(),
                                   op.getDataFormat().size());
     tensorflow::TensorFormat format;
@@ -1838,8 +1838,7 @@ static LogicalResult Verify(OpT op) {
   }

   int64_t filter_channels = ShapedType::kDynamic;
-  if (auto ty =
-          op.getFilter().getType().template dyn_cast<RankedTensorType>()) {
+  if (auto ty = mlir::dyn_cast<RankedTensorType>(op.getFilter().getType())) {
     int idx = tensorflow::GetFilterTensorInputChannelsDimIndex(
         num_dims, tensorflow::FORMAT_HWIO);
     filter_channels = ty.getDimSize(idx);
@@ -1891,8 +1890,8 @@ static LogicalResult inferConvReturnTypeComponents(
   const int64_t num_dims = 2 + num_spatial_dims;
   const Value input = op.getInput();
   const Value filter = op.getFilter();
-  const TensorType input_ty = input.getType().template cast<TensorType>();
-  const TensorType filter_ty = filter.getType().template cast<TensorType>();
+  const TensorType input_ty = mlir::cast<TensorType>(input.getType());
+  const TensorType filter_ty = mlir::cast<TensorType>(filter.getType());

   ArrayRef<Attribute> strides = op.getStrides().getValue();
   StringRef data_format = op.getDataFormat();
@@ -1910,7 +1909,7 @@ static LogicalResult inferConvReturnTypeComponents(
   (void)padding_is_valid;

   auto get_int = [](Attribute attr) {
-    return attr.template cast<IntegerAttr>().getInt();
+    return mlir::cast<IntegerAttr>(attr).getInt();
   };

   // Output always have `num_dims` rank. All dimensions are initialized to
@@ -1967,7 +1966,7 @@ LogicalResult Conv2DOp::inferReturnTypeComponents(
   Conv2DOpAdaptor op(operands.getValues(), attributes, properties, regions);
   ArrayRef<int64_t> explicit_padding;
   ArrayAttr explicit_pad =
-      op.getExplicitPaddings().dyn_cast_or_null<::mlir::ArrayAttr>();
+      mlir::dyn_cast_or_null<::mlir::ArrayAttr>(op.getExplicitPaddings());
   if (!explicit_pad) {
     explicit_pad = ::mlir::Builder(context).getI64ArrayAttr({});
   }
@@ -1984,7 +1983,7 @@ StringRef Conv2DOp::GetOptimalLayout(const RuntimeDevices& devices) {
     return getDataFormat();

   // Input must be a tensor.
-  auto input_ty = getInput().getType().dyn_cast<RankedTensorType>();
+  auto input_ty = mlir::dyn_cast<RankedTensorType>(getInput().getType());
   if (!input_ty) return getDataFormat();

   // For f16 data type on devices with Tensor Cores support NHWC data format
@@ -1998,7 +1997,7 @@ StringRef Conv2DOp::GetOptimalLayout(const RuntimeDevices& devices) {
     return getDataFormat();

   // Keep current data format if filter rank is unknown or not equal to 4.
-  auto filter_ty = getFilter().getType().dyn_cast<RankedTensorType>();
+  auto filter_ty = mlir::dyn_cast<RankedTensorType>(getFilter().getType());
   if (!filter_ty || filter_ty.getRank() != 4) return getDataFormat();

   const int64_t d0 = filter_ty.getDimSize(0);
@@ -2006,7 +2005,7 @@ StringRef Conv2DOp::GetOptimalLayout(const RuntimeDevices& devices) {

   auto all_ones = [](ArrayAttr arr) -> bool {
     return llvm::all_of(arr, [](Attribute attr) -> bool {
-      return attr.cast<IntegerAttr>().getInt() == 1;
+      return mlir::cast<IntegerAttr>(attr).getInt() == 1;
     });
   };
@@ -2068,7 +2067,7 @@ StringRef Conv2DBackpropFilterOp::GetOptimalLayout(
     return getDataFormat();

   // Input must be a tensor.
-  auto input_ty = getInput().getType().dyn_cast<RankedTensorType>();
+  auto input_ty = mlir::dyn_cast<RankedTensorType>(getInput().getType());
   if (!input_ty) return getDataFormat();

   // For f16 data type on devices with Tensor Cores support NHWC data format
@@ -2142,7 +2141,7 @@ StringRef Conv2DBackpropInputOp::GetOptimalLayout(
     return getDataFormat();

   // Filter must be a tensor.
-  auto filter_ty = getFilter().getType().dyn_cast<RankedTensorType>();
+  auto filter_ty = mlir::dyn_cast<RankedTensorType>(getFilter().getType());
   if (!filter_ty) return getDataFormat();

   // For f16 data type on devices with Tensor Cores support NHWC data format
@@ -2177,7 +2176,7 @@ LogicalResult Conv3DOp::inferReturnTypeComponents(

 LogicalResult DataFormatVecPermuteOp::verify() {
   DataFormatVecPermuteOp op = *this;
-  auto input_ty = op.getX().getType().dyn_cast<RankedTensorType>();
+  auto input_ty = mlir::dyn_cast<RankedTensorType>(op.getX().getType());
   if (!input_ty) return success();

   int rank = input_ty.getRank();
@@ -2285,12 +2284,12 @@ class DivNoNanOrMulNoNanConstantY : public OpRewritePattern {
     if (auto yDefOp = dyn_cast_or_null<TF::ConstOp>(y.getDefiningOp())) {
       Type typeOfElementsInY = getElementTypeOrSelf(y.getType());
       ElementsAttr attr = yDefOp.getValue();
-      bool yHasComplexElements = typeOfElementsInY.isa<ComplexType>();
+      bool yHasComplexElements = mlir::isa<ComplexType>(typeOfElementsInY);

       // If `y` is a splat constant, then the op will definitely get replaced.
       // We check for a splat constant first, in order to optimize the
       // performance of this canonicalization because this check will be O(1).
-      if (auto splatAttr = attr.dyn_cast<SplatElementsAttr>()) {
+      if (auto splatAttr = mlir::dyn_cast<SplatElementsAttr>(attr)) {
         bool splatAttrIsZero = false;
         if (!yHasComplexElements) {
           if (splatAttr.getSplatValue<APFloat>().isZero())
@@ -2356,7 +2355,8 @@ LogicalResult DynamicStitchOp::verify() {
   if (op.getN() < 1)
     return op.emitOpError("requires attribute N with value >= 1");

-  if (RankedTensorType out_ty = op.getType().dyn_cast<RankedTensorType>()) {
+  if (RankedTensorType out_ty =
+          mlir::dyn_cast<RankedTensorType>(op.getType())) {
     if (out_ty.getRank() == 0) {
       return op.emitOpError("requires non scalar output");
     }
@@ -2383,8 +2383,9 @@ LogicalResult DynamicStitchOp::verify() {
     }

     Value data = std::get<1>(it);
-    RankedTensorType index_ty = index.getType().dyn_cast<RankedTensorType>();
-    RankedTensorType data_ty = data.getType().dyn_cast<RankedTensorType>();
+    RankedTensorType index_ty =
+        mlir::dyn_cast<RankedTensorType>(index.getType());
+    RankedTensorType data_ty = mlir::dyn_cast<RankedTensorType>(data.getType());
     if (!index_ty || !data_ty) continue;

     int64_t index_rank = index_ty.getRank();
@@ -2429,7 +2430,7 @@ LogicalResult DynamicStitchOp::verify() {
     expected_shape.append(inferred_item_shape->begin(),
                           inferred_item_shape->end());

-    auto out_ty = op.getType().cast<TensorType>();
+    auto out_ty = mlir::cast<TensorType>(op.getType());
     auto expected_out_ty = tensorflow::GetTypeFromTFTensorShape(
         expected_shape, out_ty.getElementType());

@@ -2471,25 +2472,25 @@ OpFoldResult EmptyOp::fold(FoldAdaptor adaptor) {
   Attribute attr = operands.front();
   if (!attr) return {};

-  auto int_attr = attr.cast<DenseIntElementsAttr>();
+  auto int_attr = mlir::cast<DenseIntElementsAttr>(attr);
   SmallVector<int64_t, 6> out_shape;
   for (const auto val : int_attr.getValues<int32_t>()) {
     out_shape.push_back(val);
   }

-  auto type = getResult().getType().cast<ShapedType>();
+  auto type = mlir::cast<ShapedType>(getResult().getType());
   auto etype = type.getElementType();

   // We can not fold if the result is not static.
   if (!type.hasStaticShape()) return {};

-  if (auto float_type = etype.dyn_cast<FloatType>()) {
+  if (auto float_type = mlir::dyn_cast<FloatType>(etype)) {
     auto out_type = tensorflow::GetTypeFromTFTensorShape(out_shape, float_type);
     return DenseElementsAttr::get(out_type,
                                   {APFloat(float_type.getFloatSemantics())});
   }

-  if (auto int_type = etype.dyn_cast<IntegerType>()) {
+  if (auto int_type = mlir::dyn_cast<IntegerType>(etype)) {
     auto out_type = tensorflow::GetTypeFromTFTensorShape(out_shape, etype);
     APInt val(int_type.getWidth(), 0, int_type.getSignedness());
     return DenseElementsAttr::get(out_type, val);
@@ -2580,7 +2581,7 @@ EnqueueTPUEmbeddingSparseTensorBatchOp::GetResourceInstanceStr() {
 //===----------------------------------------------------------------------===//

 OpFoldResult EnsureShapeOp::fold(FoldAdaptor) {
-  ShapedType type = getInput().getType().dyn_cast<ShapedType>();
+  ShapedType type = mlir::dyn_cast<ShapedType>(getInput().getType());
   if (!type || !type.hasRank()) return {};
   // If shape attribute equals input operand's type's shape, fold it to input.
   std::optional<ArrayRef<int64_t>> shape_constraint = getShape();
@@ -2639,15 +2640,15 @@ static LogicalResult flipComatibleShapeError(Ty op, PatternRewriter& rewriter) {
   // we don't know which one it is. TF shape inference turns unranked outputs
   // into ranked ones if it can statically evaluate the broadcast, see the shape
   // function of tf.Equal.
-  auto ty = op.getType().template dyn_cast<RankedTensorType>();
+  auto ty = mlir::dyn_cast<RankedTensorType>(op.getType());
   if (!ty) {
     return rewriter.notifyMatchFailure(op, "requires a ranked output shape");
   }

   // Unless this is a scalar compare, a scalar output indicates that this will
   // always fail.
-  auto x_ty = op.getX().getType().template dyn_cast<RankedTensorType>();
-  auto y_ty = op.getY().getType().template dyn_cast<RankedTensorType>();
+  auto x_ty = mlir::dyn_cast<RankedTensorType>(op.getX().getType());
+  auto y_ty = mlir::dyn_cast<RankedTensorType>(op.getY().getType());
   if (ty.getRank() == 0 &&
       (!x_ty || x_ty.getRank() != 0 || !y_ty || y_ty.getRank() != 0)) {
     return rewriter.notifyMatchFailure(op, "output rank must match input rank");
@@ -2675,10 +2676,10 @@ void NotEqualOp::getCanonicalizationPatterns(RewritePatternSet& results,
 //===----------------------------------------------------------------------===//

 Type InferExpandDimsOpType(Value input, Value dim) {
-  Type element_ty = input.getType().cast<TensorType>().getElementType();
+  Type element_ty = mlir::cast<TensorType>(input.getType()).getElementType();
   auto unranked_ty = UnrankedTensorType::get(element_ty);

-  auto input_ty = input.getType().dyn_cast<RankedTensorType>();
+  auto input_ty = mlir::dyn_cast<RankedTensorType>(input.getType());
   if (!input_ty) return unranked_ty;

   DenseIntElementsAttr dim_attr;
@@ -2773,7 +2774,7 @@ LogicalResult FakeQuantWithMinMaxVarsPerChannelOp::verify() {
            "requires num_bits to be between 2 and 16, inclusive");
   }

-  auto inputs_type = inputs.getType().dyn_cast<RankedTensorType>();
+  auto inputs_type = mlir::dyn_cast<RankedTensorType>(inputs.getType());
   if (!inputs_type) return success();
   int depth = inputs_type.getDimSize(inputs_type.getRank() - 1);
   if ((min && min.getDimSize(0) != depth) ||
@@ -2800,7 +2801,7 @@ LogicalResult FillOp::verify() {
 }

 static ShapedType InferFillOpType(Value dims, Value value) {
-  Type etype = value.getType().cast<ShapedType>().getElementType();
+  Type etype = mlir::cast<ShapedType>(value.getType()).getElementType();

   DenseIntElementsAttr dims_attr;
   if (matchPattern(dims, m_Constant(&dims_attr))) {
@@ -2813,7 +2814,7 @@ static ShapedType InferFillOpType(Value dims, Value value) {
   }

   if (auto shape_op = dims.getDefiningOp<ShapeOp>()) {
-    if (auto t = shape_op.getInput().getType().dyn_cast<RankedTensorType>()) {
+    if (auto t = mlir::dyn_cast<RankedTensorType>(shape_op.getInput().getType())) {
       return t;
     }
   }
@@ -2830,20 +2831,20 @@ OpFoldResult FillOp::fold(FoldAdaptor adaptor) {
   auto operands = adaptor.getOperands();
   assert(operands.size() == 2 && "fill op has two operand");

-  auto type = getType().cast<ShapedType>();
+  auto type = mlir::cast<ShapedType>(getType());
   // DenseElementsAttr that is used in this folder only supports int and float
   // types.
   // TODO(hinsu): Handle complex types once there is a attribute kind for
   // complex.
   if (!type.getElementType().isIntOrFloat()) return {};

-  auto value = operands[1].dyn_cast_or_null<ElementsAttr>();
+  auto value = mlir::dyn_cast_or_null<ElementsAttr>(operands[1]);
   if (!value) return {};

   if (type.hasStaticShape())
     return DenseElementsAttr::get(type, value.getValues<Attribute>()[0]);

-  auto dims = operands[0].dyn_cast_or_null<DenseIntElementsAttr>();
+  auto dims = mlir::dyn_cast_or_null<DenseIntElementsAttr>(operands[0]);
   if (!dims) return {};

   llvm::SmallVector<int64_t, 4> shape;
@@ -2876,7 +2877,7 @@ StringRef FusedBatchNormGradV3Op::GetOptimalLayout(

   // For f16 data type on devices with Tensor Cores support NHWC data format
   // is up to ~2x faster.
-  auto x_ty = getX().getType().cast<TensorType>();
+  auto x_ty = mlir::cast<TensorType>(getX().getType());
   const bool is_f16 = x_ty.getElementType().isF16();
   if (is_f16 && CanUseTensorCores(devices)) return "NHWC";
@@ -2940,7 +2941,7 @@ static StringRef GetOptimalLayout(const RuntimeDevices& devices, Op* op) {

   // For f16 data type on devices with Tensor Cores support NHWC data format
   // is up to ~2x faster.
-  auto x_ty = op->getX().getType().template cast<TensorType>();
+  auto x_ty = mlir::cast<TensorType>(op->getX().getType());
   const bool is_f16 = x_ty.getElementType().isF16();
   if (is_f16 && CanUseTensorCores(devices)) return "NHWC";
@@ -3045,7 +3046,7 @@ void GeneratorDatasetRegionOp::getSuccessorRegions(

 LogicalResult GatherV2Op::verify() {
   GatherV2Op op = *this;
   int64_t batch_dims = op.getBatchDims();
-  if (auto ty = op.getIndices().getType().dyn_cast<RankedTensorType>()) {
+  if (auto ty = mlir::dyn_cast<RankedTensorType>(op.getIndices().getType())) {
     int64_t rank = ty.getRank();
     if (batch_dims > rank || batch_dims < -rank)
       return op.emitOpError()
@@ -3060,7 +3061,7 @@ LogicalResult GatherV2Op::verify() {
   DenseIntElementsAttr axis_attr;
   if (matchPattern(op.getAxis(), m_Constant(&axis_attr))) {
     int64_t axis = (*axis_attr.begin()).getSExtValue();
-    if (auto ty = op.getParams().getType().dyn_cast<RankedTensorType>()) {
+    if (auto ty = mlir::dyn_cast<RankedTensorType>(op.getParams().getType())) {
       int64_t rank = ty.getRank();
       if (axis >= rank || axis < -rank)
         return op.emitOpError() << "axis (" << axis << ") must be in range ["
@@ -3283,7 +3284,7 @@ void IfRegionOp::getSuccessorRegions(

 // Verifies that the input is 1D.
 LogicalResult InvertPermutationOp::verify() {
   InvertPermutationOp op = *this;
-  auto x_type = op.getX().getType().cast<TensorType>();
+  auto x_type = mlir::cast<TensorType>(op.getX().getType());
   if (!x_type.hasRank()) return success();
   if (x_type.getShape().size() != 1)
     return op.emitOpError() << "requires input x to be 1-dimensional";
@@ -3310,10 +3311,12 @@ OpFoldResult LeakyReluOp::fold(FoldAdaptor adaptor) {
     return FloatAttr::get(arg.getType(), val);
   };

-  if (auto arg = operands[0].dyn_cast_or_null<FloatAttr>()) {
+  if (auto arg = mlir::dyn_cast_or_null<FloatAttr>(operands[0])) {
     return calculate(arg);
-  } else if (auto arg = operands[0].dyn_cast_or_null<SplatElementsAttr>()) {
-    if (auto elementAttr = arg.getSplatValue<Attribute>().dyn_cast<FloatAttr>())
+  } else if (auto arg =
+                 mlir::dyn_cast_or_null<SplatElementsAttr>(operands[0])) {
+    if (auto elementAttr =
+            mlir::dyn_cast<FloatAttr>(arg.getSplatValue<Attribute>()))
       return DenseElementsAttr::get(arg.getType(), calculate(elementAttr));
   }
   return {};
@@ -3378,7 +3381,7 @@ OpFoldResult LogicalAndOp::fold(FoldAdaptor adaptor) {
   auto result_type = getType();

   for (const auto& operand : operands) {
-    auto splat_attr = operand.dyn_cast_or_null<SplatElementsAttr>();
+    auto splat_attr = mlir::dyn_cast_or_null<SplatElementsAttr>(operand);
     if (!splat_attr) continue;

     if (splat_attr.getType() != result_type) continue;
@@ -3540,7 +3543,8 @@ LogicalResult MeanOp::FoldOperandsPermutation(ArrayRef<int64_t> permutation) {
       dyn_cast_or_null<TF::ConstOp>(getReductionIndices().getDefiningOp());
   if (!reduction_op) return failure();

-  auto reductions_value = reduction_op.getValue().dyn_cast<DenseElementsAttr>();
+  auto reductions_value =
+      mlir::dyn_cast<DenseElementsAttr>(reduction_op.getValue());
   if (!reductions_value) return failure();

   // Prepare new reduction indices according to operand permutation.
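MeanOp::FoldOperandsPermutation has to remap its constant reduction indices when the operand layout is permuted (e.g. NHWC to NCHW). A hedged sketch of that remapping; whether the op applies the permutation in this direction or its inverse is not visible in the hunk above, so treat the mapping below as an assumption (function name ours):

#include <cstdint>
#include <vector>

// Illustrative: if dimension d of the old layout moves to perm[d] in the new
// layout, then a reduction over old dimension d becomes a reduction over
// perm[d].
std::vector<int64_t> PermuteReductionIndices(
    const std::vector<int64_t>& indices, const std::vector<int64_t>& perm) {
  std::vector<int64_t> out;
  out.reserve(indices.size());
  for (int64_t d : indices) out.push_back(perm[d]);
  return out;
}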
@@ -3597,8 +3601,8 @@ void HashTableOp::getCanonicalizationPatterns(RewritePatternSet& results,

 LogicalResult BitcastOp::verify() {
   BitcastOp op = *this;
-  auto input_type = op.getInput().getType().cast<ShapedType>();
-  auto output_type = op.getOutput().getType().cast<ShapedType>();
+  auto input_type = mlir::cast<ShapedType>(op.getInput().getType());
+  auto output_type = mlir::cast<ShapedType>(op.getOutput().getType());
   auto input_element_type = input_type.getElementType();
   auto output_element_type = output_type.getElementType();
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc
index d67c1da227d1c6..b3ce501c1c08d1 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc
@@ -15,6 +15,8 @@ limitations under the License.

 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h"

+#include "mlir/Support/LLVM.h"  // from @llvm-project
+
 namespace mlir {
 namespace TF {

@@ -60,7 +62,7 @@ ArrayAttr ShuffleArrayAttr(ArrayAttr attr, ArrayRef<int64_t> permutation,

 // Shuffle ranked tensor dimensions according to the permutation.
 Type ShuffleRankedTensorType(Type type, ArrayRef<int64_t> permutation) {
-  if (auto ranked_type = type.dyn_cast<RankedTensorType>()) {
+  if (auto ranked_type = mlir::dyn_cast<RankedTensorType>(type)) {
     ArrayRef<int64_t> shape = ranked_type.getShape();
     assert(permutation.size() == shape.size());
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
index 3dfe56bc625f5a..c0528f35bd11fe 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include
 #include
 #include
+#include <limits>

 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -103,6 +104,11 @@ Value LookThroughIdentity(Value result) {
   return result;
 }

+bool IsWithinInt32Range(int64_t value) {
+  return (value >= std::numeric_limits<int32_t>::min() &&
+          value <= std::numeric_limits<int32_t>::max());
+};
+
 #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc"
 }  // namespace

@@ -416,10 +422,27 @@ struct ConvertPackToReshape : public OpRewritePattern {
       return failure();
     }

-    // Create constant shape for reshape.
-    auto type = tensorflow::GetTypeFromTFTensorShape(
+    auto output_int_type = tensorflow::GetTypeFromTFTensorShape(
         output_ty.getRank(), rewriter.getIntegerType(64));
-    auto shape_attr = DenseIntElementsAttr::get(type, output_ty.getShape());
+    auto shape_attr =
+        DenseIntElementsAttr::get(output_int_type, output_ty.getShape());
+
+    // Use int32_t instead of int64_t if all elements are within the int32
+    // range, because int64 is not supported by XLA's dynamic reshape.
+    bool elements_all_in_int32_range =
+        std::all_of(output_ty.getShape().begin(), output_ty.getShape().end(),
+                    IsWithinInt32Range);
+
+    if (elements_all_in_int32_range) {
+      std::vector<int32_t> output_shape(output_ty.getRank());
+      std::transform(output_ty.getShape().begin(), output_ty.getShape().end(),
+                     output_shape.begin(),
+                     [](int64_t val) { return static_cast<int32_t>(val); });
+      output_int_type = tensorflow::GetTypeFromTFTensorShape(
+          output_ty.getRank(), rewriter.getIntegerType(32));
+      shape_attr = DenseIntElementsAttr::get(output_int_type, output_shape);
+    }
+
     auto shape = rewriter.create<ConstOp>(pack_op.getLoc(), shape_attr);

     // TODO(b/173622615): Remove after fixed.
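The ConvertPackToReshape change above emits the reshape's shape constant as int32 whenever every dimension fits, since XLA's dynamic reshape lacks int64 support. The guard-and-narrow step in isolation (names ours):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <vector>

bool FitsInInt32(int64_t v) {
  return v >= std::numeric_limits<int32_t>::min() &&
         v <= std::numeric_limits<int32_t>::max();
}

// Narrow an int64 shape to int32 when every element fits; otherwise return an
// empty vector so the caller keeps the int64 constant.
std::vector<int32_t> NarrowShapeOrEmpty(const std::vector<int64_t>& shape) {
  if (!std::all_of(shape.begin(), shape.end(), FitsInInt32)) return {};
  std::vector<int32_t> out(shape.size());
  std::transform(shape.begin(), shape.end(), out.begin(),
                 [](int64_t v) { return static_cast<int32_t>(v); });
  return out;
}

int main() {
  assert(NarrowShapeOrEmpty({2, 3, 4}).size() == 3);       // fits: narrowed
  assert(NarrowShapeOrEmpty({int64_t{1} << 40}).empty());  // too big: keep i64
}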
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc
index 24036d17b588e6..ca8f27a1489c06 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/Matchers.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 
 namespace mlir {
 namespace TF {
@@ -33,9 +34,9 @@ class IdentityNOp;
 RankedTensorType GetRankedTensorTypeForOperand(Value operand) {
   DenseElementsAttr attr;
   if (matchPattern(operand, m_Constant(&attr))) {
-    return attr.getType().dyn_cast<RankedTensorType>();
+    return mlir::dyn_cast<RankedTensorType>(attr.getType());
   }
-  return operand.getType().dyn_cast<RankedTensorType>();
+  return mlir::dyn_cast<RankedTensorType>(operand.getType());
 }
 
 // Returns the tf.Equal/tf.NotEqual result type given `x` and `y` and inputs. If
@@ -53,7 +54,7 @@ Type DeduceEqualCmpOpType(Builder *builder, Location loc, Value x, Value y,
     }
   }
 
-  auto ranked_type = result_type.dyn_cast<RankedTensorType>();
+  auto ranked_type = mlir::dyn_cast<RankedTensorType>(result_type);
   if (!ranked_type) return UnrankedTensorType::get(builder->getI1Type());
 
   return RankedTensorType::get(ranked_type.getShape(), builder->getI1Type());
@@ -65,7 +66,7 @@ Type InferReductionOpType(Value input, Value reduction_indices,
   Type element_ty = getElementTypeOrSelf(input_ty);
 
   // Output type is unranked if input type is not ranked.
-  auto ranked_ty = input_ty.dyn_cast<RankedTensorType>();
+  auto ranked_ty = mlir::dyn_cast<RankedTensorType>(input_ty);
   if (!ranked_ty) return UnrankedTensorType::get(element_ty);
   int64_t rank = ranked_ty.getRank();
 
@@ -124,7 +125,7 @@ LogicalResult VerifyTypesCompatibility(Operation::operand_type_range types,
   // the dimension index on the first mismatch and ignore dimension at that
   // index in following types.
   for (Type ty : types) {
-    RankedTensorType ranked_ty = ty.dyn_cast<RankedTensorType>();
+    RankedTensorType ranked_ty = mlir::dyn_cast<RankedTensorType>(ty);
     if (!ranked_ty) continue;
 
     int64_t rank = ranked_ty.getRank();
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h
index aaf795afd72917..e77ea7d77deef0 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 
 namespace mlir {
 
@@ -36,7 +37,7 @@ RankedTensorType GetRankedTensorTypeForOperand(Value operand);
 // given `rank`.
 inline bool IsOfRankedFloatTensorType(RankedTensorType type, int rank) {
   return type && type.getRank() == rank &&
-         type.getElementType().isa<FloatType>();
+         mlir::isa<FloatType>(type.getElementType());
 }
 
 // Returns true if the given `value` has the specified rank or has unranked
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
index e5ecef28a38377..45717471e373a2 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
@@ -34,6 +34,7 @@ limitations under the License.
#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -46,11 +47,11 @@ namespace tf_saved_model { //===----------------------------------------------------------------------===// static bool IsStrArrayAttr(Attribute attr) { - auto array = attr.dyn_cast(); + auto array = mlir::dyn_cast(attr); if (!array) return false; - return llvm::all_of(array, - [](Attribute attr) { return attr.isa(); }); + return llvm::all_of( + array, [](Attribute attr) { return mlir::isa(attr); }); } //===----------------------------------------------------------------------===// @@ -58,10 +59,11 @@ static bool IsStrArrayAttr(Attribute attr) { //===----------------------------------------------------------------------===// LogicalResult VerifyTensorTypesCompatible(Type t1, Type t2) { - if (!t1.isa() || !t2.isa()) { + if (!mlir::isa(t1) || !mlir::isa(t2)) { return failure(); } - return verifyCompatibleShape(t1.cast(), t2.cast()); + return verifyCompatibleShape(mlir::cast(t1), + mlir::cast(t2)); } LogicalResult GlobalTensorOp::verify() { @@ -75,7 +77,7 @@ LogicalResult GlobalTensorOp::verify() { } } if (!global_tensor.getIsMutable()) { - if (!global_tensor.getType().cast().hasStaticShape()) { + if (!mlir::cast(global_tensor.getType()).hasStaticShape()) { return global_tensor.emitError() << "'type' attribute for immutable 'tf_saved_model.global_tensor' " "should have a static shape"; @@ -91,7 +93,7 @@ LogicalResult SessionInitializerOp::verify() { for (auto sym_ref : session_initializer.getInitializers()) { auto init_func_op = symbol_table.lookup( - sym_ref.cast().getValue()); + mlir::cast(sym_ref).getValue()); if (!init_func_op) return session_initializer.emitOpError() @@ -143,16 +145,16 @@ TensorFlowSavedModelDialect::TensorFlowSavedModelDialect(MLIRContext *context) } static LogicalResult VerifyIndexPath(Operation *op, NamedAttribute named_attr) { - auto attr = named_attr.getValue().dyn_cast(); + auto attr = mlir::dyn_cast(named_attr.getValue()); if (!attr) { return op->emitError() << "'" << kTfSavedModelIndexPathAttr << "' attribute should be an ArrayAttr"; } for (auto element : attr) { - if (element.isa()) { + if (mlir::isa(element)) { continue; } - if (auto integer = element.dyn_cast()) { + if (auto integer = mlir::dyn_cast(element)) { if (integer.getValue().getBitWidth() == 64) { continue; } @@ -165,7 +167,7 @@ static LogicalResult VerifyIndexPath(Operation *op, NamedAttribute named_attr) { Type GetBoundInputArgTypeFor(mlir::Operation *op) { if (auto global_tensor = llvm::dyn_cast(op)) { - auto type = global_tensor.getType().cast(); + auto type = mlir::cast(global_tensor.getType()); return RankedTensorType::get( {}, TF::ResourceType::get({type}, type.getContext())); } @@ -196,12 +198,12 @@ LogicalResult TensorFlowSavedModelDialect::verifyRegionArgAttribute( Operation *op, unsigned region_index, unsigned arg_index, NamedAttribute named_attr) { if (named_attr.getName() == "tf_saved_model.bound_input") { - if (!named_attr.getValue().isa()) { + if (!mlir::isa(named_attr.getValue())) { return op->emitError() << "'tf_saved_model.bound_input' attribute should " "be a FlatSymbolRefAttr"; } auto symbol_name = - named_attr.getValue().cast().getValue(); + 
     auto module = op->getParentOfType<ModuleOp>();
     mlir::Operation *symbol_op = module.lookupSymbol(symbol_name);
     if (!symbol_op) {
@@ -292,8 +294,8 @@ static LogicalResult VerifySavedModelModule(
             &op, {exported_names_ident, attr}))) {
       return failure();
     }
-    for (auto str : attr.cast<ArrayAttr>()) {
-      auto exported_name = str.cast<StringAttr>().getValue();
+    for (auto str : mlir::cast<ArrayAttr>(attr)) {
+      auto exported_name = mlir::cast<StringAttr>(str).getValue();
       auto p = exported_name_to_op.insert({exported_name, &op});
       if (!p.second) {
         return op.emitError()
@@ -341,7 +343,8 @@ static LogicalResult VerifySavedModelModule(
     auto init_syms = (*session_initializers.begin()).getInitializers();
     return std::any_of(
         init_syms.begin(), init_syms.end(), [&](Attribute sym_ref) {
-          return sym_ref.cast<FlatSymbolRefAttr>().getValue() == func.getName();
+          return mlir::cast<FlatSymbolRefAttr>(sym_ref).getValue() ==
+                 func.getName();
         });
   };
 
@@ -439,7 +442,7 @@ LogicalResult VerifyInitializerTypeAttr(Operation *op,
 
   // Validate the attribute value.
   auto initializer_type_attr_value =
-      named_attr.getValue().dyn_cast_or_null<StringAttr>();
+      mlir::dyn_cast_or_null<StringAttr>(named_attr.getValue());
   if (!initializer_type_attr_value) {
     return op->emitError() << "Attribute tf_saved_model.initializer_type "
                            << "should be a StringAttr.";
@@ -504,7 +507,7 @@ SmallVector<StringRef, 2> GetExportedNames(Operation *op) {
       op->getAttrOfType<ArrayAttr>(kTfSavedModelExportedNamesAttr);
   if (exported_names) {
     for (auto name : exported_names) {
-      ret.push_back(name.cast<StringAttr>().getValue());
+      ret.push_back(mlir::cast<StringAttr>(name).getValue());
     }
   }
   return ret;
@@ -547,7 +550,7 @@ class OptimizeSessionInitializerPattern
     SmallVector<SymbolRefAttr, 2> to_keep;
     for (auto sym_ref : op.getInitializers()) {
       auto init_func_op = symbol_table.lookup<func::FuncOp>(
-          sym_ref.cast<FlatSymbolRefAttr>().getValue());
+          mlir::cast<FlatSymbolRefAttr>(sym_ref).getValue());
 
       // The init function can only be referenced from the SessionInitializerOp.
       // And there is at most one SessionInitializerOp in the module. So if both
@@ -590,7 +593,7 @@ SmallVector<StringRef, 2> GetSessionInitializerExportedName(ModuleOp op) {
 
   SmallVector<StringRef, 2> results;
   for (auto sym_ref : session_initializer_op.getInitializers()) {
     auto init_func_op = symbol_table.lookup<func::FuncOp>(
-        sym_ref.cast<FlatSymbolRefAttr>().getValue());
+        mlir::cast<FlatSymbolRefAttr>(sym_ref).getValue());
     auto exported_names = GetExportedNames(init_func_op);
     assert(exported_names.size() == 1);
     results.push_back(exported_names[0]);
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h
index 62f6192c1f84f0..c6abd7689beddc 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h
@@ -26,6 +26,7 @@ limitations under the License.
#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -38,7 +39,7 @@ namespace TF { static inline LogicalResult VerifyRefTypeMatch(mlir::Type type, mlir::Type maybe_ref_type) { if (auto ref_type = - maybe_ref_type.dyn_cast()) + mlir::dyn_cast(maybe_ref_type)) return success(ref_type.RemoveRef().getTypeID() == type.getTypeID()); return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 04052926504174..a5925ac4156baa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -1924,8 +1924,8 @@ func.func @testFoldEnsureShapeOp(%arg0: tensor<10x20xf32>) -> (tensor<10x20xf32> func.func @testConvertPackToReshapeAxis0(%arg0: tensor<2x3xf32>) -> tensor<1x2x3xf32> { %0 = "tf.Pack"(%arg0) {axis = 0 : i64, _xla_outside_compilation = "1", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>) -> tensor<1x2x3xf32> func.return %0 : tensor<1x2x3xf32> - // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[1, 2, 3]> : tensor<3xi64>}> : () -> tensor<3xi64> - // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {_xla_outside_compilation = "1", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<1x2x3xf32> + // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[1, 2, 3]> : tensor<3xi32>}> : () -> tensor<3xi32> + // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {_xla_outside_compilation = "1", device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi32>) -> tensor<1x2x3xf32> // CHECK: return %[[RESHAPE]] : tensor<1x2x3xf32> } @@ -1933,8 +1933,8 @@ func.func @testConvertPackToReshapeAxis0(%arg0: tensor<2x3xf32>) -> tensor<1x2x3 func.func @testConvertPackToReshapeAxis1(%arg0: tensor<2x3xf32>) -> tensor<2x1x3xf32> { %0 = "tf.Pack"(%arg0) {axis = 1 : i64, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>) -> tensor<2x1x3xf32> func.return %0 : tensor<2x1x3xf32> - // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 1, 3]> : tensor<3xi64>}> : () -> tensor<3xi64> - // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi64>) -> tensor<2x1x3xf32> + // CHECK: %[[SHAPE:.*]] = "tf.Const"() <{value = dense<[2, 1, 3]> : tensor<3xi32>}> : () -> tensor<3xi32> + // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%arg0, %[[SHAPE]]) {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<2x3xf32>, tensor<3xi32>) -> tensor<2x1x3xf32> // CHECK: return %[[RESHAPE]] : tensor<2x1x3xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir index 90f1cfc2fd5027..bafe05c2eebcc4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir @@ -105,20 +105,18 @@ func.func @cluster_operands(%arg0: tensor) -> tensor { // ----- // Tests cluster 
attributes are copied over to cluster_func. -// Includes device info propagation. // CHECK-LABEL: func @cluster_attrs func.func @cluster_attrs() -> tensor { %0 = "tf_device.cluster"() ({ %1 = "tf.A"() : () -> tensor tf_device.return %1 : tensor - }) {cluster_attr = "cluster_attr", device = "device"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor func.return %0 : tensor } // CHECK: "tf_device.cluster_func" // CHECK-SAME: cluster_attr = "cluster_attr" -// CHECK-SAME: device = "device" // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/tests/convert_to_legacy_compile_and_replicate_attributes.mlir b/tensorflow/compiler/mlir/tensorflow/tests/convert_to_legacy_compile_and_replicate_attributes.mlir index bc900be065c4e8..e27ebb2ea5189c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/convert_to_legacy_compile_and_replicate_attributes.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/convert_to_legacy_compile_and_replicate_attributes.mlir @@ -37,7 +37,7 @@ func.func @convert_to_legacy_attributes_failure(%arg0: tensor<*xf32>, %arg1: ten %outputs_9, %control_10 = tf_executor.island(%control_4) wraps "tf.Identity"(%outputs_7) {_replication_info = "cluster", _tpu_input_identity = true, _xla_compile_device_type = "TPU", device = ""} : (tensor<*xf32>) -> tensor<*xf32> %outputs_11, %control_12 = tf_executor.island wraps "tf.Mul"(%outputs, %outputs_9) {_replication_info = "cluster", _xla_compile_device_type = "TPU", device = ""} : (tensor, tensor<*xf32>) -> tensor<*xf32> %outputs_13, %control_14 = tf_executor.island wraps "tf.AddV2"(%outputs_11, %outputs_0) {_replication_info = "cluster", _xla_compile_device_type = "TPU", device = ""} : (tensor<*xf32>, tensor) -> tensor<*xf32> - // expected-error @+1 {{'tf.Identity' op has '_replication_info' attribute but not '_xla_compile_device_type' attribute which is unsupported}} + // expected-error @+1 {{'tf.Identity' op is expected to have either both or none of '_replication_info' and '_xla_compile_device_type' attributes}} %outputs_15, %control_16 = tf_executor.island wraps "tf.Identity"(%outputs_13) {_replication_info = "cluster", _tpu_output_identity = true, device = "/device:TPU_REPLICATED_CORE:0"} : (tensor<*xf32>) -> tensor<*xf32> %outputs_17, %control_18 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%outputs_15) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> %outputs_19, %control_20 = tf_executor.island(%control_3) wraps "tf.Identity"(%outputs_17) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir index ca1e4c99549d94..9abb90805961c3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir @@ -595,3 +595,17 @@ func.func @unsupported_op_gpu_cluster() -> tensor { }) {allow_soft_placement = true, _xla_compile_device_type = "GPU"} : () -> tensor func.return %0 : tensor } + +// CHECK-LABEL: func @xla_host_compute +func.func @xla_host_compute(%arg0: tensor) { + "tf_device.cluster"() ({ + %cst = "tf.Const"() {value = dense<16> : tensor} : () -> tensor + // CHECK: tf.XlaHostCompute + // CHECK-SAME:_xla_original_oc_node_name = "hcb0", _xla_token_input_nodes = ["_xla_token_arg_node"] + "tf.XlaHostCompute"(%cst) <{ancestors = [], cost_estimate_ns = 1000000 : i64, key = "_host_callback", recv_key = "", send_key = "", shapes = [], 
tpu_core = 0 : i64}> {_xla_original_oc_node_name = "hcb0", _xla_token_input_nodes = ["_xla_token_arg_node"]} : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + func.return +} + + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/simple_tf_dialect_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/simple_tf_dialect_op.mlir deleted file mode 100644 index 780406e0c16127..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/simple_tf_dialect_op.mlir +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: tf-mlir-translate -test-only-mlir-to-tf-nodedef %s -o - | FileCheck %s - -func.func @main() { -^bb0: - // CHECK: name: "node_name" - // CHECK-NEXT: op: "Const" - // CHECK-NEXT: attr { - // CHECK: key: "dtype" - // CHECK-NEXT: value { - // CHECK-NEXT: type: DT_INT32 - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: attr { - // CHECK-NEXT: key: "value" - // CHECK-NEXT: value { - // CHECK-NEXT: tensor { - // CHECK-NEXT: dtype: DT_INT32 - // CHECK-NEXT: tensor_shape { - // CHECK-NEXT: dim { - // CHECK-NEXT: size: 2 - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: tensor_content: "\200\000\000\000\200\000\000\000" - // CHECK: experimental_debug_info { - // CHECK-NEXT: original_node_names: "n1" - // CHECK-NEXT: original_func_names: "f1" - // CHECK-NEXT: } - %0 = "tf.Const"() {value = #tf_type : tensor<2xi32>} : () -> (tensor<2xi32>) loc(fused[callsite("n1@f1" at callsite("node_name" at "file_loc"))]) - func.return -} - - diff --git a/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-executor.mlir b/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-executor.mlir deleted file mode 100644 index ddc9ea80d37036..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/roundtrip-tf-executor.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: tf-opt %s --run-tf-graph-optimization --graph-passes=MlirRoundtripPass | FileCheck %s - -// The test uses the tf_graph_optimization_pass to run the MlirRoundtripPass. -// We convert mlir -> Graph -> mlir -> Graph -> mlir - -func.func @main() { - tf_executor.graph { - %0 = tf_executor.island wraps "tf.NoOp"() {} : () -> () loc("X") - tf_executor.fetch - } - func.return -} - -// Check for the presence of tf.NoOp in the final output. 
-// CHECK: tf.NoOp
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference_with_shape_specialization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference_with_shape_specialization.mlir
new file mode 100644
index 00000000000000..b016e3ee033012
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference_with_shape_specialization.mlir
@@ -0,0 +1,42 @@
+// RUN: tf-opt %s -tf-shape-inference=input-arg-shapes=1 -verify-diagnostics -split-input-file | FileCheck %s
+// RUN: not tf-opt %s -tf-shape-inference=input-arg-shapes=* 2>&1 | FileCheck --check-prefix=INPUT_ARG_SHAPES_ERROR %s
+// INPUT_ARG_SHAPES_ERROR: Missing input argument shapes
+
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+  // CHECK-LABEL: func.func @main
+  // CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() <{value = dense<1.000000e+00> : tensor<f32>}> : () -> tensor<f32>
+  // CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() <{value = dense<3> : tensor<i32>}> : () -> tensor<i32>
+  // CHECK-NEXT: %[[UQ:.*]] = "tf.UniformQuantize"(%arg0, %cst, %cst_0) <{quantization_axis = -1 : i64, quantization_max_val = 127 : i64, quantization_min_val = -128 : i64}> : (tensor<1xf32>, tensor<f32>, tensor<i32>) -> tensor<1x!tf_type.qint8>
+  // CHECK-NEXT: %[[UDQ:.*]] = "tf.UniformDequantize"(%[[UQ]], %[[CST_0]], %[[CST_1]]) <{quantization_axis = -1 : i64, quantization_max_val = 127 : i64, quantization_min_val = -128 : i64}> : (tensor<1x!tf_type.qint8>, tensor<f32>, tensor<i32>) -> tensor<1xf32>
+  // CHECK-NEXT: return %[[UDQ]] : tensor<1xf32>
+  func.func @main(%arg0 : tensor<?xf32>) -> tensor<?xf32> {
+    %scales = "tf.Const"() { value = dense<1.0> : tensor<f32> } : () -> tensor<f32>
+    %zps = "tf.Const"() { value = dense<3> : tensor<i32> } : () -> tensor<i32>
+
+    %0 = "tf.UniformQuantize"(%arg0, %scales, %zps) {
+      quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64
+    } : (tensor<?xf32>, tensor<f32>, tensor<i32>) -> tensor<?x!tf_type.qint8>
+    %1 = "tf.UniformDequantize"(%0, %scales, %zps) {
+      quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64
+    } : (tensor<?x!tf_type.qint8>, tensor<f32>, tensor<i32>) -> tensor<?xf32>
+    func.return %1 : tensor<?xf32>
+  }
+}
+
+// -----
+
+// expected-error@+1 {{Input shapes provided but no `main` function found.}}
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+  func.func @non_main(%arg0 : tensor<?xf32>) -> tensor<?xf32> {
+    %scales = "tf.Const"() { value = dense<1.0> : tensor<f32> } : () -> tensor<f32>
+    %zps = "tf.Const"() { value = dense<3> : tensor<i32> } : () -> tensor<i32>
+
+    %0 = "tf.UniformQuantize"(%arg0, %scales, %zps) {
+      quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64
+    } : (tensor<?xf32>, tensor<f32>, tensor<i32>) -> tensor<?x!tf_type.qint8>
+    %1 = "tf.UniformDequantize"(%0, %scales, %zps) {
+      quantization_axis = -1 : i64, quantization_min_val = -128 : i64, quantization_max_val = 127 : i64
+    } : (tensor<?x!tf_type.qint8>, tensor<f32>, tensor<i32>) -> tensor<?xf32>
+    func.return %1 : tensor<?xf32>
+  }
+}
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir
index c4ca46d34d7501..4ebc7cb4d063a2 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir
@@ -62,3 +62,30 @@ func.func @multiple_result_user(%arg0: tensor, %arg1: tensor<*x!tf_type.res
 func.func @multiple_result_user_func(%arg0: tensor) -> tensor {
   func.return %arg0 : tensor
 }
+
+// CHECK-LABEL:
@reads_outside_replicate_op +func.func @reads_outside_replicate_op(%arg0: tensor<*x!tf_type.resource>> {tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}) { +// CHECK-COUNT-1: tf.ReadVariableOp +// CHECK: tf_device.replicate +// CHECK-NOT: tf.ReadVariableOp + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor<1xf32> + %cst = "tf.Const"() <{value = dense<0.000000e+00> : tensor}> : () -> tensor + %cst_0 = "tf.Const"() <{value = dense<1> : tensor<1xi64>}> : () -> tensor<1xi64> + %fill = "tf.Fill"(%cst_0, %cst) : (tensor<1xi64>, tensor) -> tensor<1xf32> + tf_device.replicate([%0, %fill] as %arg_r0: tensor<1xf32>) {n = 2 : i32} { + %1 = "tf_device.launch"() <{device = "TPU_REPLICATED_HOST_0"}> ({ + %2 = "tf.Identity"(%arg_r0) : (tensor<1xf32>) -> tensor<1xf32> + tf_device.return %2 : tensor<1xf32> + }) : () -> tensor<1xf32> + %3 = "tf_device.cluster_func"(%1) <{func = @write_chain_func}> {_replication_info = "cluster__train_helper", _xla_compile_device_type = "TPU", num_cores_per_replica = 1 : i64} : (tensor<1xf32>) -> tensor<1xf32> + "tf.AssignVariableOp"(%arg0, %3) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<1xf32>) -> () + tf_device.return + } + func.return +} + +func.func private @write_chain_func(%arg0: tensor<1xf32>) -> (tensor<1xf32>) { + %cst = "tf.Const"() <{value = dense<[[0, 1]]> : tensor<1x2xi32>}> : () -> tensor<1x2xi32> + %0 = "tf.XlaAllReduce"(%arg0, %cst) <{mode = "CrossReplica", reduce_op = "Add"}> : (tensor<1xf32>, tensor<1x2xi32>) -> tensor<1xf32> + return %0 : tensor<1xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir index 1d3c1b6f3cf518..4ede37c3d1b4d2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir @@ -724,8 +724,8 @@ func.func @cluster_ops_keep_replicated_core_attr() { // ----- -func.func @missing_compilation_attribute() { - // expected-error@+1 {{'tf.opA' op has '_replication_info' attribute but not '_xla_compile_device_type' attribute which is unsupported}} +func.func @missing_replication_or_compilation_attribute() { + // expected-error@+1 {{'tf.opA' op is expected to have either both or none of '_replication_info' and '_xla_compile_device_type' attributes}} %0 = "tf.opA"() { _replication_info = "replicate", device = "/device:TPU_REPLICATED_CORE:0", name = "name", is_stateless = true} : () -> tensor "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 1, topology = "topology"} : () -> () func.return @@ -744,124 +744,17 @@ func.func @empty_replication_attribute() { func.func @invalid_device_type() { // expected-error@+1 {{'tf.opA' op has invalid '_xla_compile_device_type' value 'XPU'}} - "tf.opA"() { _xla_compile_device_type = "XPU", _replication_info = "replicate", is_stateless = true} : () -> () - func.return -} - -// ----- - -// Check non-replicated case, including expected attributes at device cluster. 
-// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK: tf_device.return -// CHECK: }) {_replication_info = "__no_replication_cluster", _xla_compile_device_type = "TPU", allow_soft_placement = true, device_assignment = [], num_cores_per_replica = 1 : i32, step_marker_location = "", topology = "", use_spmd_for_xla_partitioning = false} -func.func @valid_compilation_cluster_no_replication() { - "tf.opA"() { _xla_compile_device_type = "TPU", is_stateless = true} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", is_stateless = true} : () -> () - func.return -} - -// ----- - -// Check non-replicated case, empty op device to no device in cluster. -// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK: tf_device.return -// CHECK-NOT: device = -// CHECK: return -func.func @valid_compilation_cluster_no_replication_empty_op_device() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = ""} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", device = ""} : () -> () - func.return -} - - -// Check non-replicated case, including expected device attr in cluster. -// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK: device = "/device:TPU:1" -func.func @valid_compilation_cluster_no_replication_op_device() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = "/device:TPU:1"} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", device = "/device:TPU:1"} : () -> () - func.return -} - -// ----- - -// Check conflicting device names -// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK-NOT: device = -func.func @do_nothing_if_short_names_conflict() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = "/replica:1/task:2/device:TPU:1"} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", device = "/replica:3/task:4/device:TPU:1"} : () -> () - func.return -} - -// ----- - -// Check non-replicated case, including expected device attr in cluster. -// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK: device = "/task:0/device:TPU:1" -func.func @valid_compilation_cluster_no_replication_op_device() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = "/task:0/device:TPU:1"} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", device = "/device:TPU:1"} : () -> () - func.return -} - -// ----- - -// Check non-replicated case, including expected device attr in cluster. -// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK: device = "/task:0/device:TPU:1" -func.func @valid_compilation_cluster_no_replication_op_device() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = "/device:TPU:1"} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", device = "/task:0/device:TPU:1"} : () -> () - func.return -} - -// ----- - -// Check non-replicated case, empty op device to no device in cluster. -// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK: tf_device.return -// CHECK-NOT: device = -// CHECK: return -func.func @valid_compilation_cluster_no_replication_op_device() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = "/device:TPU:0"} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", device = "/task:0/device:TPU:1"} : () -> () - func.return -} - -// ----- - -// Check non-replicated case, empty op device to no device in cluster. 
-// CHECK: "tf_device.cluster"() -// CHECK: "tf.opA"() -// CHECK: "tf.opB"() -// CHECK: tf_device.return -// CHECK-NOT: device = -// CHECK: return -func.func @valid_compilation_cluster_no_replication_op_device() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = "/device:CPU:0"} : () -> () + %0 = "tf.opA"() {_xla_compile_device_type = "XPU", _replication_info = "replicate", device = "/device:TPU:0", name = "name", is_stateless = true} : () -> tensor + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 1, topology = "topology"} : () -> () func.return } // ----- // expected-error@+1 {{found different '_xla_compile_device_type' attribute values (GPU,TPU) in same block which is not supported}} func.func @invalid_compilation_cluster_mixed_device_types() { - "tf.opA"() { _xla_compile_device_type = "GPU", is_stateless = true} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", is_stateless = true} : () -> () + "tf.opA"() { _xla_compile_device_type = "GPU", _replication_info = "replicate", is_stateless = true} : () -> () + "tf.opB"() { _xla_compile_device_type = "TPU", _replication_info = "replicate", is_stateless = true} : () -> () + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 1, topology = "topology"} : () -> () func.return } @@ -871,37 +764,31 @@ func.func @invalid_compilation_cluster_mixed_device_types() { func.func @invalid_compilation_replication_cluster_mixed_device_types() { "tf.opA"() { _xla_compile_device_type = "CPU", _replication_info = "cluster", is_stateless = true} : () -> () "tf.opB"() { _xla_compile_device_type = "GPU", _replication_info = "cluster", is_stateless = true} : () -> () - func.return -} - -// ----- - -// expected-error@+1 {{found mixed replicated and non-replicated compiled ops in same block which is not supported}} -func.func @mixed_replicated_non_replicated_ops() { - "tf.opA"() { _xla_compile_device_type = "TPU", is_stateless = true} : () -> () - "tf.opB"() { _xla_compile_device_type = "TPU", _replication_info = "cluster", is_stateless = true} : () -> () + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 1, topology = "topology"} : () -> () func.return } // ----- func.func @cyclic_control_dependency_no_replication() { - "tf.opA"() {_xla_compile_device_type = "TPU"} : () -> () + "tf.opA"() {_xla_compile_device_type = "TPU", _replication_info = "replicate"} : () -> () // expected-warning-re@+1 {{Op has cyclic dependency with a compilation cluster{{.*}}}} "tf.opB"() : () -> () - "tf.opC"() {_xla_compile_device_type = "TPU"} : () -> () + "tf.opC"() {_xla_compile_device_type = "TPU", _replication_info = "replicate"} : () -> () + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 1, topology = "topology"} : () -> () func.return } // ----- func.func @cyclic_data_dependency_no_replication() { - %0 = "tf.opA"() {_xla_compile_device_type = "TPU", is_stateless = true} : () -> (tensor) + %0 = "tf.opA"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", is_stateless = true} : () -> (tensor) // expected-warning-re@+2 {{Op has cyclic dependency with a compilation cluster{{.*}}}} // expected-error@+1 {{operand #0 does not dominate this use}} %1 = "tf.opB"(%0) {is_stateless = true} : (tensor) -> 
(tensor) // expected-note@+1 {{operand defined here (op in the same block)}} - "tf.opC"(%1) {_xla_compile_device_type = "TPU", is_stateless = true} : (tensor) -> () + "tf.opC"(%1) {_xla_compile_device_type = "TPU", _replication_info = "replicate", is_stateless = true} : (tensor) -> () + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 1, topology = "topology"} : () -> () func.return } @@ -920,12 +807,12 @@ func.func @cyclic_control_dependency_replication() { // ----- func.func @cyclic_data_dependency_replication() { - %0 = "tf.opA"() {_xla_compile_device_type = "TPU", is_stateless = true} : () -> (tensor) + %0 = "tf.opA"() {_xla_compile_device_type = "TPU", _replication_info = "cluster", is_stateless = true} : () -> (tensor) // expected-warning-re@+2 {{Op has cyclic dependency with a compilation cluster{{.*}}}} // expected-error@+1 {{operand #0 does not dominate this use}} %1 = "tf.opB"(%0) {is_stateless = true} : (tensor) -> (tensor) // expected-note@+1 {{operand defined here (op in the same block)}} - "tf.opC"(%1) {_xla_compile_device_type = "TPU", is_stateless = true} : (tensor) -> () + "tf.opC"(%1) {_xla_compile_device_type = "TPU", _replication_info = "cluster", is_stateless = true} : (tensor) -> () "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () func.return } @@ -935,6 +822,7 @@ func.func @cyclic_data_dependency_replication() { // expected-warning@+1 {{TPUReplicateMetadata for associated '_replication_info' attribute 'cluster' is missing}} func.func @missing_metadata() { "tf.opA"() {_xla_compile_device_type = "TPU", _replication_info = "cluster"} : () -> () + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "replicate", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () func.return } @@ -1035,11 +923,12 @@ func.func @gather_nd(%arg0: tensor<*x!tf_type.resource>>, Tindices = i32 } : (tensor<*x!tf_type.resource>>, tensor) -> tensor<1x80xf32> %2 = "tf.Add"(%1, %1) { - _xla_compile_device_type = "TPU", + _xla_compile_device_type = "TPU", _replication_info = "cluster", device = "/task:0/device:TPU:0", dtype = f32 } : (tensor<1x80xf32>, tensor<1x80xf32>) -> tensor<1x80xf32> %3 = "tf.ResourceGatherNd"(%arg0, %0) { Tindices = i32 } : (tensor<*x!tf_type.resource>>, tensor) -> tensor<1x80xf32> + "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "cluster", device = "/device:TPU:0", num_replicas = 1, topology = "topology"} : () -> () func.return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index db28242944434e..a148b78bf42332 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -605,41 +605,6 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests user given device in cluster_func is propagated correctly. 
- -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - // CHECK-LABEL: func @no_replication_device - func.func @no_replication_device() { - "tf_device.cluster_func"() {_xla_compile_device_type = "TPU", _replication_info = "__no_replication_cluster", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device = "/job:worker/replica:0/task:0/device:TPU:1", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () - // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:1"}> - // CHECK: tf.TPUExecute - // CHECK-NEXT: tf_device.return - func.return - } - func.func @empty_func() { - func.return - } -} - -// ----- - -// Tests CPU given device in cluster_func is not propagated. - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - // CHECK-LABEL: func @no_replication_device - func.func @no_replication_device() { - "tf_device.cluster_func"() {_xla_compile_device_type = "TPU", _replication_info = "__no_replication_cluster", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device = "/job:worker/replica:0/task:0/device:CPU:0", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () - // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:0"}> - // CHECK: tf.TPUExecute - // CHECK-NEXT: tf_device.return - func.return - } - func.func @empty_func() { - func.return - } -} - -// ----- // Tests metadata is populated correctly for use_spmd_for_xla_partitioning == // true. 
@@ -2579,8 +2544,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - func.func @missing_compilation_attribute() { - // expected-error@+1 {{'tf_device.cluster_func' op has '_replication_info' attribute but not '_xla_compile_device_type' attribute which is unsupported}} + func.func @missing_compilation_and_replication_attributes() { + // expected-error@+1 {{'tf_device.cluster_func' op is expected to have either both or none of '_replication_info' and '_xla_compile_device_type' attributes}} "tf_device.cluster_func"() {_replication_info = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () func.return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir index 4af8cdf06f727f..295079b24fe799 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_validate_inputs.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: func @num_replicas_replicated func.func @num_replicas_replicated(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor %ro:2, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor) @@ -16,7 +16,7 @@ func.func @num_replicas_replicated(%arg0: tensor, %arg1: tensor, %arg2 func.func @num_replicas_replicated_input(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () // expected-error @+1 {{'tf.TPUReplicatedInput' op TF2XLA TPU bridge input check: number of inputs inconsistent. num_replicas=2 no. 
of inputs=3}} %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor @@ -30,7 +30,7 @@ func.func @num_replicas_replicated_input(%arg0: tensor, %arg1: tensor, func.func @num_replicas_replicated_input_packed(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () // expected-error @+1 {{'tf.TPUReplicatedInput' op TF2XLA TPU bridge input check: packed with number of inputs not 1. num_replicas=2 no. of inputs=2}} %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = true} : (tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor @@ -44,7 +44,7 @@ func.func @num_replicas_replicated_input_packed(%arg0: tensor, %arg1: tenso func.func @num_replicas_replicated_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor // expected-error @+1 {{'tf.TPUReplicatedOutput' op TF2XLA TPU bridge input check: number of outputs inconsistent. num_replicas=2 no. of outputs=3}} @@ -58,7 +58,7 @@ func.func @num_replicas_replicated_output(%arg0: tensor, %arg1: tensor func.func @num_core_per_replica_partitioned_input(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () // expected-error @+1 {{'tf.TPUPartitionedInput' op TF2XLA TPU bridge input check: number of inputs inconsistent. num_cores_per_replica=2 no. 
of inputs=3}} %pi, %c0 = tf_executor.island wraps "tf.TPUPartitionedInput"(%arg0, %arg1, %arg1) {index = 1 : i64} : (tensor, tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%pi) {_tpu_replicate = "cluster"} : (tensor) -> tensor @@ -72,7 +72,7 @@ func.func @num_core_per_replica_partitioned_input(%arg0: tensor, %arg1: ten func.func @num_core_per_replica_partitioned_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () %pi, %c0 = tf_executor.island wraps "tf.TPUPartitionedInput"(%arg0, %arg1) {index = 1 : i64} : (tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%pi) {_tpu_replicate = "cluster"} : (tensor) -> tensor // expected-error @+1 {{'tf.TPUPartitionedOutput' op TF2XLA TPU bridge input check: number of outputs inconsistent. num_cores_per_replica=2 no. of outputs=3}} @@ -86,7 +86,7 @@ func.func @num_core_per_replica_partitioned_output(%arg0: tensor, %arg1: te func.func @validate_tpu_replicate_no_attr(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate="cluster"}: (tensor) -> tensor // expected-warning @+1 {{TF2XLA TPU bridge input check: cluster op = tf.opA with cluster = cluster has successor as non cluster op tf.opB}} @@ -102,7 +102,7 @@ func.func @validate_tpu_replicate_no_attr(%arg0: tensor, %arg1: tensor func.func @validate_tpu_replicate_wrong_attr(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster_wrong"}: (tensor) -> tensor // expected-error @+1 {{'tf.opB' op TF2XLA TPU bridge input check: mismatch clusters tpu_replicate attr. 
Parent op tf.opA with cluster = cluster_wrong has successor cluster op tf.opB with cluster = cluster}} @@ -117,7 +117,7 @@ func.func @validate_tpu_replicate_wrong_attr(%arg0: tensor, %arg1: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor, tensor) { %0:2 = tf_executor.graph { - %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster", device = "TPU"} : (tensor) -> tensor %ro:2, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> (tensor, tensor) @@ -130,7 +130,7 @@ func.func @valid_xla_nonxla(%arg0: tensor, %arg1: tensor, %arg2: tenso func.func @valid_xla_nonxla_warning(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> (tensor<*x!tf_type.string>, tensor<*x!tf_type.string>) { %0:2 = tf_executor.graph { - %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 2, topology = "topology"} : () -> () %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0, %arg1) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor, tensor) -> tensor<*x!tf_type.string> // expected-warning @+1 {{TF/XLA TPU bridge input check: found invalid op. 
tf.Identity can't be both xla and non-xla}} %out, %c1 = tf_executor.island(%c0) wraps "tf.Identity"(%ri) {_tpu_replicate = "cluster", device = ""} : (tensor<*x!tf_type.string>) -> tensor<*x!tf_type.string> @@ -151,7 +151,7 @@ func.func @valid_xla_nonxla_warning(%arg0: tensor, %arg1: tensor, %arg func.func @valid_MAXIMAL_sharding_device(%arg0: tensor) -> tensor { %0 = tf_executor.graph { - %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () %0, %c = tf_executor.island wraps "tf.Identity"(%arg0) {_tpu_replicate = "cluster", _XlaSharding = "\08\01\1A\01\01\22\01\00"} : (tensor) -> tensor tf_executor.fetch %0 : tensor } @@ -168,7 +168,7 @@ func.func @valid_MAXIMAL_sharding_device(%arg0: tensor) -> tensor { func.func @invalid_MAXIMAL_sharding_device(%arg0: tensor) -> tensor { %0 = tf_executor.graph { - %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () // expected-error @+1 {{'tf.Identity' op TF2XLA TPU bridge input check: invalid sharding device 2 for num_cores_per_replica = 2}} %0, %c = tf_executor.island wraps "tf.Identity"(%arg0) {_tpu_replicate = "cluster", _XlaSharding = "\08\01\1A\01\01\22\01\02"} : (tensor) -> tensor tf_executor.fetch %0 : tensor @@ -194,7 +194,7 @@ func.func @invalid_MAXIMAL_sharding_device(%arg0: tensor) -> tensor { func.func @invalid_TUPLE_sharding_arity(%arg0: tensor) -> tensor { %0 = tf_executor.graph { - %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () // expected-error @+1 {{'tf.Identity' op TF2XLA TPU bridge input check: invalid no. 
of tuple shardings 2 for arity = 1}} %0, %c = tf_executor.island wraps "tf.Identity"(%arg0) {_tpu_replicate = "cluster", _XlaSharding = "\08\02\2a\08\08\01\1a\01\01\22\01\00\2a\08\08\01\1a\01\01\22\01\01"} : (tensor) -> tensor tf_executor.fetch %0 : tensor @@ -220,11 +220,36 @@ func.func @invalid_TUPLE_sharding_arity(%arg0: tensor) -> tensor { func.func @outfeed_enqueue_tuple_sharding_exception(%arg0: tensor, %arg1: tensor) -> tensor { %0 = tf_executor.graph { - %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () + %control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_cores_per_replica = 2 : i64, num_replicas = 1 : i64, topology = "topology"} : () -> () %0, %c0 = tf_executor.island wraps "tf.AddV2"(%arg0, %arg1) {_tpu_replicate = "cluster"} : (tensor, tensor) -> tensor %c1 = tf_executor.island wraps "tf.OutfeedEnqueueTuple"(%arg0, %arg1) {_tpu_replicate = "cluster", _XlaSharding = "\08\02\2a\08\08\01\1a\01\01\22\01\00\2a\08\08\01\1a\01\01\22\01\01"} : (tensor, tensor) -> () tf_executor.fetch %0 : tensor } return %0 : tensor } -// ----- \ No newline at end of file + + +// ----- + +func.func @single_core_tpu(%arg0: tensor) -> () { + tf_executor.graph { + // expected-error @+1 {{found a single-core TPU graph}} + tf_executor.island wraps "tf.Identity"(%arg0) {_xla_compile_device_type = "TPU"} : (tensor) -> tensor + tf_executor.fetch + } + return +} + +// ----- + +// CHECK-LABEL: func @num_replicas_1 +func.func @num_replicas_1(%arg0: tensor) -> (tensor) { + %0 = tf_executor.graph { + %control = tf_executor.island() wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "/device:TPU:0", num_replicas = 1, num_cores_per_replica = 1, topology = "topology"} : () -> () + %ri, %c0 = tf_executor.island wraps "tf.TPUReplicatedInput"(%arg0) {index = 1 : i64, is_mirrored_variable = false, is_packed = false} : (tensor) -> tensor + %out, %c1 = tf_executor.island wraps "tf.opA"(%ri) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %ro, %c2 = tf_executor.island wraps "tf.TPUReplicatedOutput"(%out) : (tensor) -> tensor + tf_executor.fetch %ro : tensor + } + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_inputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_inputs.mlir new file mode 100644 index 00000000000000..1beb284ce9509a --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_inputs.mlir @@ -0,0 +1,21 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-xla-validate-inputs + +// expected-error @+1 {{expects no nested calls of entry functions as they prevent graph traversal in some passes from working correctly}} +func.func @nested_entry_functions() attributes {tf.entry_function = {}} { + tf_executor.graph { + %control = tf_executor.island wraps "tf.StatefulPartitionedCall"() {config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @func} : () -> () + tf_executor.fetch + } + func.return +} + +func.func @func() attributes {tf.entry_function = {}} { + func.return +} + +// ----- + +// expected-error @+1 {{does not support top-level compilation marker}} +func.func @top_level_compilation_marker() attributes {_xla_compile_device_type = "CPU", tf.entry_function = {}} { + func.return +} diff --git 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_iputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_iputs.mlir
deleted file mode 100644
index 81f3398321f569..00000000000000
--- a/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_iputs.mlir
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-xla-validate-inputs
-
-// expected-error @+1 {{TF2XLA MLIR CPU/GPU phase 1 bridge expects no nested calls of entry functions as they prevent graph traversal in some passes from working correctly}}
-func.func @nested_entry_functions(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} {
-  %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @func} : (tensor) -> (tensor)
-  func.return %0 : tensor
-}
-
-func.func @func(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} {
-  func.return %arg0 : tensor
-}
-
-// -----
-
-// expected-error @+1 {{TF2XLA MLIR CPU/GPU MLIR phase 1 bridge expects single region and single block in an entry function}}
-func.func @multi_blocks_entry_function(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} {
-  cf.br ^bb1
-^bb1:
-  func.return %arg0 : tensor
-}
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
index 2e090224a5c86c..4daaf633212451 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
@@ -147,6 +147,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/core:framework",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -298,6 +299,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -531,7 +533,6 @@ cc_library(
         "tpu_reorder_replicate_and_partitioned_inputs.cc",
         "tpu_resource_partitioning.cc",
         "tpu_resource_read_for_write.cc",
-        "tpu_sharding_identification_pass.cc",
         "tpu_space_to_depth_pass.cc",
         "tpu_update_embedding_enqueue_op_inputs.cc",
         "tpu_validate_inputs.cc",
@@ -777,6 +778,7 @@ cc_library(
         ":tf_pass_inc_gen",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils",
+        "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags",
        "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils",
         "//tensorflow/compiler/mlir/tensorflow:shape_inference_utils",
         "//tensorflow/compiler/mlir/tensorflow:translate_utils",
@@ -786,6 +788,9 @@ cc_library(
         "//tensorflow/core/ir/types:Dialect",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncExtensions",
@@ -796,6 +801,7 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TransformUtils",
+        "@local_tsl//tsl/platform:errors",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:window_util",
         "@local_xla//xla:xla_data_proto_cc",
@@ -1029,6 +1035,7 @@ cc_library(
         "//tensorflow/core:framework",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -1040,6 +1047,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
         "@local_xla//xla:shape_util",
         "@local_xla//xla/mlir_hlo",
         "@local_xla//xla/stream_executor/tpu:c_api_conversions",
@@ -1062,6 +1070,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
         "@local_tsl//tsl/platform:path",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc b/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc
index 996686eb525d03..52765fb5657eba 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/annotate_parameter_replication.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
@@ -70,14 +71,14 @@ void AnnotateParameterReplicationPass::runOnOperation() {
     if (mirrored_variable_indices_attr) {
       for (const auto& mirrored_index : mirrored_variable_indices_attr) {
         mirrored_replicate_args.insert(
-            mirrored_index.cast().getInt());
+            mlir::cast(mirrored_index).getInt());
       }
     }
     auto func = llvm::cast(m.lookupSymbol(cluster_func.getFunc()));
     for (auto entry : llvm::enumerate(cluster_func.getOperands())) {
       auto operand = SkipIdentityAndReadVariable(entry.value());
-      auto block_arg = operand.dyn_cast();
+      auto block_arg = mlir::dyn_cast(operand);
       if (block_arg && block_arg.getOwner() == &replicate.GetBody()) {
         // Only mirrored args of ReplicateOp can be annotated.
         if (mirrored_replicate_args.count(block_arg.getArgNumber()) == 0) {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc
index 14ab17be0fdee5..c6e21cb1e03054 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc
@@ -53,8 +53,8 @@ class ConvertTFBatchMatMulToEinsumOp
     Value input_rhs = op.getY();
 
     // LHS and RHS must be a ranked tensor type
-    auto lhs_type = input_lhs.getType().dyn_cast();
-    auto rhs_type = input_rhs.getType().dyn_cast();
+    auto lhs_type = mlir::dyn_cast(input_lhs.getType());
+    auto rhs_type = mlir::dyn_cast(input_rhs.getType());
 
     if (!lhs_type || !rhs_type) return failure();
 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc
index 4b409ffe1f614f..3ce5fb5bcb8379 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc
@@ -27,6 +27,7 @@ limitations under the License.
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #define DEBUG_TYPE "cluster-ops-by-policy" @@ -44,7 +45,7 @@ ValueConstraint Merge(ValueConstraint a, ValueConstraint b) { LogicalResult IsStaticallyResolved(Value value, ValueConstraint constraint) { // Resolve constraints inferred from the tensor type. - if (auto tensor = value.getType().dyn_cast()) { + if (auto tensor = mlir::dyn_cast(value.getType())) { if (constraint == ValueConstraint::kRank && tensor.hasRank()) return success(); if (constraint == ValueConstraint::kShape && tensor.hasStaticShape()) @@ -710,7 +711,7 @@ void EmitValueConstraintsRemarks(const ValuesConstraintSet &constraints) { void EmitInputsConstraintsRemarks(func::FuncOp func, const ValuesConstraintSet &constraints) { constraints.Walk([&](Value value, ValueConstraint constraint) { - if (auto arg = value.dyn_cast()) + if (auto arg = mlir::dyn_cast(value)) if (arg.getOwner() == &func.getBody().front()) func.emitRemark(llvm::formatv("input #{0} constrained to: {1}", arg.getArgNumber(), constraint)); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc index cbe4ae6b2e41b1..355aded4f2d97a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc @@ -134,10 +134,6 @@ void OutlineCluster(tf_device::ClusterOp cluster_op, SymbolTable* symbol_table, auto cluster_func_op = builder->create( cluster_op.getLoc(), outlined_func.getFunctionType().getResults(), live_ins.getArrayRef(), cluster_op->getAttrs()); - auto device_attr = cluster_op->getAttrOfType(TF::kDeviceAttr); - if (device_attr && !device_attr.getValue().empty()) { - cluster_func_op->setAttr(TF::kDeviceAttr, device_attr); - } cluster_op.replaceAllUsesWith(cluster_func_op); cluster_op.erase(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc index 5d9f5f9718446f..3d3e1305993a30 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_tf_ops_pass.cc @@ -33,6 +33,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/util/device_name_utils.h" @@ -124,7 +125,7 @@ std::optional> GetFunctionMetadatas( // If the value is defined as an argument of the func_op, adds it to // the argument list of the function that uses this op. 
- if (BlockArgument block_arg = value.dyn_cast()) { + if (BlockArgument block_arg = mlir::dyn_cast(value)) { if (StringAttr attr = func_op.getArgAttrOfType( block_arg.getArgNumber(), kTFDeviceAttr)) { value_device = attr.getValue().str(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc index 59faa220521f0b..5a83e75e9eedf4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc @@ -62,7 +62,7 @@ Value GetR1Const(ArrayRef r1, OpBuilder builder, Location loc, Value GetIndicesForElement(Value index, Value buffer, OpBuilder builder, Location loc) { - auto buffer_type = buffer.getType().cast(); + auto buffer_type = mlir::cast(buffer.getType()); if (buffer_type.getShape().size() == 1) return index; // Create a concat of index and trailing zeros. llvm::SmallVector zeros(buffer_type.getShape().size() - 1, 0); @@ -77,7 +77,7 @@ Value GetIndicesForElement(Value index, Value buffer, OpBuilder builder, Value GetElement(Value index, Value buffer, OpBuilder builder, Location loc, bool keep_slice_shape) { - auto buffer_type = buffer.getType().cast(); + auto buffer_type = mlir::cast(buffer.getType()); // Create a slice then reshape to remove the leading trivial dimension of // size 1. llvm::SmallVector slice_size = @@ -102,7 +102,7 @@ Value GetElement(Value index, Value buffer, OpBuilder builder, Location loc, Value SetElement(Value index, Value buffer, Value element, OpBuilder builder, Location loc) { - auto buffer_type = buffer.getType().cast(); + auto buffer_type = mlir::cast(buffer.getType()); // Reshape the element to add a leading dimension of size 1 if th element does // not have that dimension, then perform a dynamic update slice. 
auto slice_shape = llvm::to_vector<8>(buffer_type.getShape()); @@ -208,7 +208,7 @@ std::optional GetElementTypeFromAccess( if (type_from_alias.has_value()) return type_from_alias; } else if (auto type = infer_from_op(use.getOwner())) { if (!type) continue; - auto elem_type = type->dyn_cast(); + auto elem_type = mlir::dyn_cast(*type); if (elem_type && elem_type.hasStaticShape()) return elem_type; } } @@ -220,8 +220,8 @@ Value ReadLocalVariable(Value local_var, OpBuilder builder, Location loc) { return builder .create( loc, - ArrayRef{getElementTypeOrSelf(local_var.getType()) - .cast() + ArrayRef{mlir::cast( + getElementTypeOrSelf(local_var.getType())) .getSubtypes()[0]}, ArrayRef{local_var}) .getValue(); @@ -246,7 +246,7 @@ Value AccumulateBuffers(Value a, Value b, OpBuilder builder, Location loc) { namespace { int64_t GetFirstIfIndicesAreContiguous(Value indices) { - auto type = indices.getType().dyn_cast(); + auto type = mlir::dyn_cast(indices.getType()); if (!type) return -1; auto indices_op = indices.getDefiningOp(); if (!indices_op) return -1; @@ -270,9 +270,10 @@ int64_t GetFirstIfIndicesAreContiguous(Value indices) { Value GatherElements(Value indices, Value buffer, OpBuilder builder, Location loc) { - auto buffer_type = buffer.getType().cast(); + auto buffer_type = mlir::cast(buffer.getType()); auto result_shape = llvm::to_vector<8>(buffer_type.getShape()); - result_shape[0] = indices.getType().cast().getDimSize(0); + result_shape[0] = + mlir::cast(indices.getType()).getDimSize(0); int64_t maybe_contiguous_start = GetFirstIfIndicesAreContiguous(indices); if (maybe_contiguous_start >= 0) { llvm::SmallVector slice_starts(result_shape.size(), 0); @@ -293,8 +294,8 @@ Value GatherElements(Value indices, Value buffer, OpBuilder builder, Value ScatterAccumulateElements(Value indices, Value updates, Value buffer, OpBuilder builder, Location loc) { - auto buffer_type = buffer.getType().cast(); - auto updates_type = updates.getType().cast(); + auto buffer_type = mlir::cast(buffer.getType()); + auto updates_type = mlir::cast(updates.getType()); int64_t maybe_contiguous_start = GetFirstIfIndicesAreContiguous(indices); if (maybe_contiguous_start == 0 && buffer_type == updates_type) { return AccumulateBuffers(buffer, updates, builder, loc); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 880dfa837e881c..ca5eb4bc737b99 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -19,6 +19,7 @@ limitations under the License. #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -47,7 +48,7 @@ static bool IsFoldedByDefaultPolicy(Operation* inst) { auto get_size = [&](TypeRange types) { int64_t size = 0; for (auto t : types) { - auto tensor_type = t.cast(); + auto tensor_type = mlir::cast(t); // Ignore types with undefined bit widths. if (!tensor_type.getElementType().isIntOrFloat()) continue; if (!tensor_type.hasStaticShape()) { @@ -93,7 +94,7 @@ LogicalResult ConstantFoldFallbackHook( // propagation. 
bool has_empty_numerical_results = llvm::all_of(inst->getResultTypes(), [](Type ty) { - ShapedType shaped_ty = ty.cast(); + ShapedType shaped_ty = mlir::cast(ty); Type element_ty = shaped_ty.getElementType(); return shaped_ty.hasStaticShape() && shaped_ty.getNumElements() == 0 && element_ty.isIntOrFloat(); @@ -103,7 +104,7 @@ LogicalResult ConstantFoldFallbackHook( // addressed. inst->isRegistered()) { for (Type ty : inst->getResultTypes()) { - auto shaped_ty = ty.cast(); + auto shaped_ty = mlir::cast(ty); results.push_back( DenseElementsAttr::get(shaped_ty, llvm::ArrayRef())); } @@ -112,14 +113,14 @@ LogicalResult ConstantFoldFallbackHook( // Returns directly if any of the operands is not an elements attributes. if (std::any_of(operands.begin(), operands.end(), [](Attribute attr) { - return !attr || !attr.isa(); + return !attr || !mlir::isa(attr); })) return failure(); SmallVector inputs; inputs.reserve(operands.size()); for (auto input : operands) { - inputs.push_back(input.cast()); + inputs.push_back(mlir::cast(input)); } SmallVector constants; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc index 6d28fa03a988a6..84c96590910243 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc @@ -25,6 +25,7 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" @@ -65,8 +66,8 @@ bool CanBeFolded(Operation* inst) { // This creates opaque variant constants which lose information and would // require "raising" later. 
for (const Type type : inst->getResultTypes()) { - if (const TensorType tensor_type = type.dyn_cast()) { - if (tensor_type.getElementType().isa()) { + if (const TensorType tensor_type = mlir::dyn_cast(type)) { + if (mlir::isa(tensor_type.getElementType())) { return false; } } @@ -134,7 +135,7 @@ LogicalResult EvaluateOperation(Operation* inst, node_def->get()->op(), node_def->get()->name(), host_cpu, operands.size(), [&](tensorflow::AttrValueMap* attr_value_map) { *attr_value_map = node_def->get()->attr(); - return tensorflow::OkStatus(); + return absl::OkStatus(); }, fallback_state.device_manager(), fallback_state.process_function_library_runtime()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc index 4de43317677f63..6262cad26ca6e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc @@ -114,7 +114,7 @@ SmallVector GetWhileCallers(func::FuncOp func, } bool IsResourceType(Type type) { - return getElementTypeOrSelf(type).isa(); + return mlir::isa(getElementTypeOrSelf(type)); } bool OnlyOperatesOnCompositeDevices( @@ -124,11 +124,11 @@ bool OnlyOperatesOnCompositeDevices( auto& alias_analysis = side_effect_analysis.GetAliasAnalysis(); llvm::SmallSet read_array; for (const Attribute& attr : op.getDeviceVarReadsIndices()) { - read_array.insert(attr.cast().getInt()); + read_array.insert(mlir::cast(attr).getInt()); } llvm::SmallSet update_array; for (const Attribute& attr : op.getDeviceVarUpdatesIndices()) { - update_array.insert(attr.cast().getInt()); + update_array.insert(mlir::cast(attr).getInt()); } for (auto& arg : op->getOpOperands()) { @@ -270,7 +270,7 @@ void CollectChainResources( // // Checks if the value `control` is a NoOp control barrier. bool IsNoOpControlBarrier(Value control) { - if (!control.getType().isa()) return false; + if (!mlir::isa(control.getType())) return false; auto control_island = dyn_cast_or_null(control.getDefiningOp()); if (!control_island) return false; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc index 8e89f3988dd8d4..4af1246d5a72b6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_reduce_dataset.cc @@ -38,6 +38,7 @@ limitations under the License. 
#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" @@ -76,7 +77,7 @@ AnonymousIteratorV3Op CreateIterator(OpBuilder builder, llvm::SmallVector type_attrs; for (Type type : dataset_types) { shape_attrs.push_back( - TF::ShapeAttr::get(builder.getContext(), type.cast())); + TF::ShapeAttr::get(builder.getContext(), mlir::cast(type))); type_attrs.push_back(TypeAttr::get(getElementTypeOrSelf(type))); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index 51438ac4901b9d..4cdc90376c2317 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -90,7 +90,7 @@ TF::SumOp createSumOp(Value value, Location loc, PatternRewriter* rewriter) { Value redux_op = createI32ConstantOp(redux_axes, loc, rewriter); - auto value_type = value.getType().cast(); + auto value_type = mlir::cast(value.getType()); auto shape = value_type.getShape(); llvm::SmallVector sum_shape; for (int i = 0; i < shape.size(); ++i) { @@ -108,7 +108,7 @@ TF::TransposeOp createTransposeOp(Value value, Location loc, llvm::ArrayRef permutation, PatternRewriter* rewriter) { auto perm_op = createI32ConstantOp(permutation, loc, rewriter); - auto value_type = value.getType().cast(); + auto value_type = mlir::cast(value.getType()); auto shape = value_type.getShape(); SmallVector transposed_shape(shape.begin(), shape.end()); for (int i = 0, end = shape.size(); i < end; ++i) { @@ -529,7 +529,7 @@ LogicalResult rewriteToReduceSumAndTranspose(TF::EinsumOp op, bool needs_transpose = false; for (int64_t i = 0; i < dnums.lhs_out.size(); ++i) { if (std::get<0>(dnums.lhs_out[i]) > - lhs.getType().cast().getRank() - 1) { + mlir::cast(lhs.getType()).getRank() - 1) { continue; } @@ -637,8 +637,8 @@ LogicalResult reshapeForBatchMatmul(const Location& loc, Value* rhs, SmallVectorImpl* out_shape, PatternRewriter* rewriter) { - RankedTensorType lhs_type = lhs->getType().cast(); - RankedTensorType rhs_type = rhs->getType().cast(); + RankedTensorType lhs_type = mlir::cast(lhs->getType()); + RankedTensorType rhs_type = mlir::cast(rhs->getType()); int32_t num_lhs_reshape_segids = 0; int32_t num_rhs_reshape_segids = 0; @@ -776,7 +776,7 @@ LogicalResult rewriteToBatchMatmul(TF::EinsumOp op, EinsumDimensionNumbers original_dnums = dnums; RankedTensorType original_type = - op.getResult().getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(op.getResult().getType()); if (!original_type) return failure(); std::vector out_transpose; @@ -822,7 +822,7 @@ LogicalResult matchAndRewriteUnaryEinsumOp(TF::EinsumOp op, op, "Function only supports unary einsum op"); } RankedTensorType lhs = - op.getOperand(0).getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(op.getOperand(0).getType()); if (!lhs) { return failure(); } @@ -862,9 +862,9 @@ LogicalResult ConvertTFEinsumOp::matchAndRewrite( } RankedTensorType lhs = - op.getOperand(0).getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(op.getOperand(0).getType()); RankedTensorType rhs = - op.getOperand(1).getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(op.getOperand(1).getType()); if (!lhs 
|| !rhs) { return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc index 51afea6d84671e..e1611432f36e8c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_island_coarsening.cc @@ -32,6 +32,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/core/platform/logging.h" @@ -443,7 +444,7 @@ void InsertDummyIslandForFetch(FetchOp fetch) { control_fetches.reserve(data_fetches.capacity()); for (auto value : fetch.getFetches()) { - if (value.getType().isa()) { + if (mlir::isa(value.getType())) { control_fetches.push_back(value); } else { data_fetches.push_back(value); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc index 9567278d98dc9c..b75f081d1a0064 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc @@ -395,7 +395,7 @@ bool is_valid_special_tpu_op( bool op_has_inconsistent_cluster_name = wrapped_op_cluster_name.has_value() && - !wrapped_op_cluster_name.value().equals(cluster_name); + wrapped_op_cluster_name.value() != cluster_name; if (op_has_inconsistent_cluster_name) { return false; @@ -624,7 +624,7 @@ void TpuV1BridgeExecutorIslandCoarsening::runOnOperation() { assert(!funcs_for_cluster->second.empty()); if (funcs_for_cluster->second.size() == 1) return false; for (NamedAttribute attr : op->getAttrs()) { - auto symbol_ref = attr.getValue().dyn_cast(); + auto symbol_ref = mlir::dyn_cast(attr.getValue()); if (!symbol_ref) continue; func::FuncOp callee = symbol_table.lookup(symbol_ref.getValue()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc index 0106d149d3d343..19603170b89e20 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc @@ -178,13 +178,14 @@ void TPUBridgeExecutorIslandOutlining::runOnOperation() { for (func::FuncOp func : outlined_module.getOps()) { func.walk([&](Operation *op) { for (NamedAttribute attr : op->getAttrs()) { - if (auto symbol_ref = attr.getValue().dyn_cast()) { + if (auto symbol_ref = + mlir::dyn_cast(attr.getValue())) { MoveFuncOp(symbol_ref, symbol_table, outlined_symbol_table); continue; } - if (auto array_attr = attr.getValue().dyn_cast()) { + if (auto array_attr = mlir::dyn_cast(attr.getValue())) { for (const Attribute &attribute : array_attr) { - auto symbol_ref = attribute.dyn_cast(); + auto symbol_ref = mlir::dyn_cast(attribute); if (!symbol_ref) continue; MoveFuncOp(symbol_ref, symbol_table, outlined_symbol_table); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc 
b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc index dfd20d8dd0e07a..18480fbd772fa9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc @@ -22,6 +22,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -47,7 +48,7 @@ class ExtractTPUCopyWithDynamicShapeOpPass // Finds op that created a given value. If the value is a BlockArgument, this // returns the owner of the Block. Operation* GetOpOfValue(Value value) { - if (auto block_arg = value.dyn_cast()) + if (auto block_arg = mlir::dyn_cast(value)) return block_arg.getOwner()->getParentOp(); return value.getDefiningOp(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc index d755696c74607b..6547b6f168c3bf 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fold_broadcast.cc @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -144,7 +145,7 @@ LogicalResult ConvertResultsBroadcastableShapeOp::RewriteOp( // Check that the result shape is fully defined. auto result_type = - op->getResultTypes().front().dyn_cast_or_null(); + mlir::dyn_cast_or_null(op->getResultTypes().front()); if (!result_type || !result_type.hasStaticShape()) return failure(); bool changed = false; @@ -155,15 +156,13 @@ LogicalResult ConvertResultsBroadcastableShapeOp::RewriteOp( if (!broadcast) continue; // Check that the operand of the broadcast has fully defined shape. - auto broadcast_arg_type = - broadcast.getInput().getType().dyn_cast_or_null(); + auto broadcast_arg_type = mlir::dyn_cast_or_null( + broadcast.getInput().getType()); if (!broadcast_arg_type || !broadcast_arg_type.hasStaticShape()) continue; // Check that the other argument has fully defined shape. - auto argument_type = op->getOpOperand(1 - i) - .get() - .getType() - .dyn_cast_or_null(); + auto argument_type = mlir::dyn_cast_or_null( + op->getOpOperand(1 - i).get().getType()); if (!argument_type || !argument_type.hasStaticShape()) continue; // Get the unbroadcasted shapes in the operand order. 
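The hunks in fold_broadcast.cc above, like most of the C++ hunks in this patch, apply one mechanical migration: the deprecated member-function casts on mlir::Type, mlir::Attribute, and mlir::Value (x.cast<T>(), x.dyn_cast<T>(), x.dyn_cast_or_null<T>(), x.isa<T>()) become the free functions mlir::cast / mlir::dyn_cast / mlir::dyn_cast_or_null / mlir::isa. That is also why "mlir/Support/LLVM.h" (which re-exports the LLVM casting helpers into the mlir namespace) and the @llvm-project//mlir:Support BUILD dependency are added to so many files. A minimal before/after sketch; RankedTensorType is only an illustrative target type, since the actual template arguments were dropped from this copy of the patch:

#include "mlir/IR/BuiltinTypes.h"  // mlir::RankedTensorType
#include "mlir/IR/Value.h"         // mlir::Value
#include "mlir/Support/LLVM.h"     // brings llvm::cast/dyn_cast/isa into mlir

// Old, deprecated member-function style:
//   auto ty = v.getType().dyn_cast_or_null<mlir::RankedTensorType>();
//   bool ok = v.getType().isa<mlir::RankedTensorType>();
// New free-function style, as applied throughout this patch:
mlir::RankedTensorType GetRankedType(mlir::Value v) {
  return mlir::dyn_cast_or_null<mlir::RankedTensorType>(v.getType());
}
bool IsRanked(mlir::Value v) {
  return mlir::isa<mlir::RankedTensorType>(v.getType());
}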
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc index 6a1a4852e68a1c..9f9da90bf76594 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc @@ -86,7 +86,7 @@ void FreezeGlobalTensorsPass::runOnOperation() { DenseMap freezeable; for (auto func : module.getOps()) { for (BlockArgument val : func.getArguments()) { - if (!getElementTypeOrSelf(val.getType()).isa()) + if (!mlir::isa(getElementTypeOrSelf(val.getType()))) continue; // Check that there is only a single global tensor associated with arg. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index 0cff8946687dcb..11be79869f4fd2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -30,6 +30,7 @@ limitations under the License. #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -101,7 +102,7 @@ YieldOp CreateCall(Operation* op, func::FuncOp func, Region& caller_region, // Converts the condition for an IfOp/WhileOp to a boolean value. Value ConvertConditionToBoolean(Operation* op, Value cond) { - if (auto ranked_type = cond.getType().dyn_cast()) + if (auto ranked_type = mlir::dyn_cast(cond.getType())) if (ranked_type.getRank() == 0 && ranked_type.getElementType().isSignlessInteger(1)) return cond; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index 1c0a125598cdbe..4eb791a909022d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -332,9 +332,9 @@ class FuseMatMulBiasAdd } // FusedMatMul kernel does not support grad_a/grad_b attrs if ((matmul->hasAttr("grad_a") && - matmul->getAttr("grad_a").cast().getValue()) || + mlir::cast(matmul->getAttr("grad_a")).getValue()) || (matmul->hasAttr("grad_b") && - matmul->getAttr("grad_b").cast().getValue())) { + mlir::cast(matmul->getAttr("grad_b")).getValue())) { (void)rewriter.notifyMatchFailure(matmul, [&](Diagnostic &diag) { diag << "FusedMatMul kernel does not support grad_a/grad_b attrs"; }); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc index dff2223b11567f..e49b0f445d0c70 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc @@ -43,7 +43,7 @@ Status MlirGraphOptimizationPass::Run( ::tensorflow::MlirOptimizationPassState::Disabled) { VLOG(1) << "Skipping MLIR Graph Optimization Pass" << ", session flag not enabled"; - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } VLOG(1) << "Run MLIR Graph Optimization Passes"; 
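The graph_optimization_pass.cc hunk above (like the one in constant_fold_utils.cc earlier) swaps ::tensorflow::OkStatus() for absl::OkStatus(). A minimal sketch of the idiom, assuming tensorflow::Status is the usual alias of absl::Status so the two spellings construct the same OK value; the helper below is hypothetical and only illustrates the early-return pattern used in the patched code:

#include "absl/status/status.h"

// Returning OK via the canonical absl spelling instead of the legacy
// tensorflow::OkStatus() alias. Skipping a disabled pass is not an error.
absl::Status MaybeRunOptimizationPasses(bool pass_enabled) {
  if (!pass_enabled) return absl::OkStatus();
  // ... run the MLIR graph optimization passes ...
  return absl::OkStatus();
}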
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc index 78fb6aad3abdde..91f14794494de7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_loop_invariant.cc @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/LoopInvariantCodeMotionUtils.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -135,7 +136,7 @@ void HoistLoopInvariantPass::runOnOperation() { // Skip the pass if the function inputs contain any resource. for (const auto &type : func.getArgumentTypes()) { - if (getElementTypeOrSelf(type).isa()) return; + if (mlir::isa(getElementTypeOrSelf(type))) return; } llvm::DenseSet read_only_vars = GetReadOnlyVariables(func); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD index f8e75d9032f3e5..3d046b4c41c51f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD @@ -252,6 +252,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", "@local_xla//xla/mlir_hlo", ], diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc index 767d5cf7f0cf8c..a21c78a9e3ca82 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc @@ -32,6 +32,7 @@ limitations under the License. #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" @@ -93,7 +94,7 @@ LogicalResult SetMetadataProtoStepMarkerLocation( // Parses a xla::OpSharding from a string attribute. LogicalResult SetOpSharding(Operation* op, Attribute attr, llvm::StringRef name, int index, xla::OpSharding* sharding_ptr) { - auto sharding_attr = attr.dyn_cast(); + auto sharding_attr = mlir::dyn_cast(attr); if (!sharding_attr) return op->emitOpError( llvm::formatv(kBadStringArrayElementMsg, name, index)); @@ -130,7 +131,7 @@ LogicalResult SetMetadataProtoArgs( llvm::SmallSet dynamic_arg_idx_set; if (dynamic_arg_idx) { for (auto idx : dynamic_arg_idx.getValue()) { - dynamic_arg_idx_set.insert(idx.dyn_cast().getInt()); + dynamic_arg_idx_set.insert(mlir::dyn_cast(idx).getInt()); } } @@ -155,7 +156,8 @@ LogicalResult SetMetadataProtoArgs( // Populate argument shapes. 
*arg->mutable_shape() = tensorflow::TensorShapeProto(); - if (auto ranked_tensor_type = operand_type.dyn_cast()) { + if (auto ranked_tensor_type = + mlir::dyn_cast(operand_type)) { tensorflow::TensorShapeProto shape_proto; ConvertToTensorShapeProto(ranked_tensor_type.getShape(), &shape_proto); *arg->mutable_shape() = std::move(shape_proto); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc index ed1e0549dfb769..d8067af3f29557 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_rewrite_pass.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include "absl/strings/match.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -43,6 +42,7 @@ limitations under the License. #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -75,7 +75,6 @@ namespace mlir { namespace TFTPU { constexpr char kStepMarkerLocationAttr[] = "step_marker_location"; -constexpr char kDeviceAttr[] = "device"; constexpr char kDevicesAttr[] = "devices"; constexpr char kVersionsAttr[] = "tf.versions"; constexpr char kUseXlaSpmdAttr[] = "use_spmd_for_xla_partitioning"; @@ -139,7 +138,7 @@ LogicalResult EncapsulateFuncAndSerialize(const std::string& module_name, assert(uses && "expected to be able to collect symbol uses"); for (SymbolTable::SymbolUse use : *uses) { func::FuncOp referenced_func = entry_module_table.lookup( - use.getSymbolRef().cast().getValue()); + mlir::cast(use.getSymbolRef()).getValue()); // Skip Symbols that do not map to a function. if (!referenced_func) continue; @@ -380,18 +379,9 @@ LogicalResult AddToParallelExecuteOp( // If computation is replicated, use aliased device. Otherwise there is only // one execution device per core and the device is assigned to the execute // op. - std::string device; - if (replicated) { - device = tensorflow::GetDeviceAliasForLogicalCore(core); - } else { - auto device_attr = cluster_func->getAttrOfType(kDeviceAttr); - if (device_attr && !device_attr.str().empty() && - absl::StrContains(device_attr.str(), "TPU:")) { - device = cluster_func->getAttrOfType(kDeviceAttr).str(); - } else { - device = tpu_devices.front()[core].device; - } - } + std::string device = replicated + ? tensorflow::GetDeviceAliasForLogicalCore(core) + : tpu_devices.front()[core].device; auto block_launch_op = tensorflow::WrapOpInLaunch( builder, block.getParent()->getLoc(), execute, device); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_variable_runtime_reformatting.cc index 271e525ef8c7ae..4e87c10b1b7ac6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_variable_runtime_reformatting.cc @@ -34,6 +34,7 @@ limitations under the License. 
#include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -73,7 +74,7 @@ struct TPUVariableRuntimeReformattingPass // provided, it will be used to store the identity nodes skipped. Value SkipIdentity(Value v, bool allow_other_use, llvm::SmallPtrSet* skipped = nullptr) { - while (auto result = v.dyn_cast()) { + while (auto result = mlir::dyn_cast(v)) { if (!(allow_other_use || v.hasOneUse())) break; auto op = result.getDefiningOp(); if (!llvm::isa(op)) { @@ -108,10 +109,10 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( for (auto index_and_arg : llvm::enumerate(execute.getArgs())) { auto arg = SkipIdentity(index_and_arg.value(), /*allow_other_use=*/false); if (!arg.hasOneUse() || - !getElementTypeOrSelf(arg.getType()).isa()) { + !mlir::isa(getElementTypeOrSelf(arg.getType()))) { continue; } - auto block_arg = arg.dyn_cast(); + auto block_arg = mlir::dyn_cast(arg); if (!block_arg || block_arg.getOwner() != &replicate.GetBody()) continue; assert(replicate_arg_to_execute_arg.count(block_arg.getArgNumber()) == 0 && "Found duplicate use of a resource in the execute op."); @@ -131,13 +132,13 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( // variables (arguments of `replicate`), and must be pass-throughs from while // operands. for (const auto& mirrored_index : mirrored_variable_indices_attr) { - int64_t replicate_arg = mirrored_index.cast().getInt(); + int64_t replicate_arg = mlir::cast(mirrored_index).getInt(); // Check if the mirrored variable is an input to `execute`. auto it = replicate_arg_to_execute_arg.find(replicate_arg); if (it == replicate_arg_to_execute_arg.end()) continue; // Get the data type of the resource. - auto subtypes = getElementTypeOrSelf(execute.getOperand(it->second)) - .cast() + auto subtypes = mlir::cast( + getElementTypeOrSelf(execute.getOperand(it->second))) .getSubtypes(); if (subtypes.size() != 1) continue; auto data_type = getElementTypeOrSelf(subtypes[0]); @@ -198,14 +199,14 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( llvm::sort(mapping, llvm::less_first()); // Populate the `retval_index_for_sharding` field of the argument metadate. 
for (auto entry : llvm::enumerate(execute.getDeviceVarReadsIndices())) { - int64_t arg_index = entry.value().cast().getInt(); + int64_t arg_index = mlir::cast(entry.value()).getInt(); auto arg_metadata = metadata.mutable_args(arg_index); if (arg_metadata->enable_xla_sharding() == ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED) { - int64_t ret_index = execute.getDeviceVarUpdatesIndices() - .getValue()[entry.index()] - .cast() - .getInt(); + int64_t ret_index = + mlir::cast( + execute.getDeviceVarUpdatesIndices().getValue()[entry.index()]) + .getInt(); arg_metadata->set_retval_index_for_sharding(ret_index); } } @@ -379,12 +380,13 @@ bool HandleReplicateOp(TF::WhileRegionOp while_op, for (auto it : device_map) { auto device_alias = it.getName().strref(); - auto device_list = it.getValue().cast(); + auto device_list = mlir::cast(it.getValue()); llvm::SmallVector device_list_for_alias; device_list_for_alias.reserve(device_list.size()); for (auto device : device_list) - device_list_for_alias.emplace_back(device.cast().getValue()); + device_list_for_alias.emplace_back( + mlir::cast(device).getValue()); devices.insert({device_alias, device_list_for_alias}); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc b/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc index 3b974c395706aa..6e7fe42ef4dfab 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/initialize_variables_in_session_init.cc @@ -22,6 +22,7 @@ limitations under the License. #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" @@ -99,8 +100,7 @@ func::FuncOp GetOrCreateSessionInitFunc(ModuleOp module) { // tf_saved_model.initializer_type attribute was introduced. SymbolTable symbol_table(module); return symbol_table.lookup( - session_init_op.getInitializers()[0] - .cast() + mlir::cast(session_init_op.getInitializers()[0]) .getValue()); } else { return CreateSessionInitFunc(module); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc index aa1efc6837eee6..015499c6996f38 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/launch_to_device_attribute.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -66,7 +67,7 @@ LogicalResult AssignDevicesInRegion(const Dialect* tf_dialect, return WalkResult::advance(); } - if (auto device_str_attr = device_attr.dyn_cast()) { + if (auto device_str_attr = mlir::dyn_cast(device_attr)) { if (device_str_attr.getValue().empty()) { op->setAttr(kDeviceAttr, launch.getDeviceAttr()); return WalkResult::advance(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index 0fad3c019ea432..e8c1d1997e195e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h" @@ -49,7 +50,7 @@ TransposeOp ReuseExistingTranspose(const OpOperand* operand, auto tranpose_op = *it; for (auto tranpose_operand : tranpose_op.getOperands()) { auto ranked_tranpose_type = - tranpose_operand.getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(tranpose_operand.getType()); if (!ranked_tranpose_type) continue; if (ranked_tranpose_type.getRank() == permutation.size() && operand->get().getType() == @@ -201,7 +202,7 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { if (!perm) return; // With the same permutation indices. - auto dense_elem_attr = perm.getValue().dyn_cast(); + auto dense_elem_attr = mlir::dyn_cast(perm.getValue()); if (!dense_elem_attr) return; if (!permutation_op) permutation_op = perm; @@ -217,7 +218,7 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { // Nothing to do here. if (!permutation_op || transpose_ops.empty()) return; SmallVector permutation; - auto perm_attr = permutation_op.getValue().cast(); + auto perm_attr = mlir::cast(permutation_op.getValue()); for (const auto& value : perm_attr.getValues()) permutation.push_back(value.getSExtValue()); @@ -227,10 +228,11 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { if (op->hasTrait()) { auto transpose_op = *transpose_ops.begin(); auto result_type = - transpose_op.getResult().getType().dyn_cast_or_null(); + mlir::dyn_cast_or_null(transpose_op.getResult().getType()); auto is_valid_move = llvm::all_of(op->getOperands(), [result_type](Value operand) -> bool { - auto operand_type = operand.getType().dyn_cast_or_null(); + auto operand_type = + mlir::dyn_cast_or_null(operand.getType()); return result_type && operand_type && result_type.hasRank() && operand_type.hasRank() && result_type.getRank() == operand_type.getRank(); @@ -343,7 +345,7 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list, if (!perm) return; // With the same permutation indices. 
- auto dense_elem_attr = perm.getValue().dyn_cast(); + auto dense_elem_attr = mlir::dyn_cast(perm.getValue()); if (!dense_elem_attr) return; if (!permutation_op) permutation_op = perm; @@ -365,7 +367,7 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list, SmallVector permutation; - auto attr = permutation_op.getValue().cast(); + auto attr = mlir::cast(permutation_op.getValue()); for (const auto& value : attr.getValues()) permutation.push_back(value.getSExtValue()); @@ -373,7 +375,7 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list, if (fold_operands && fold_transpose_in_ops) { SmallVector permutation; - auto attr = permutation_op.getValue().cast(); + auto attr = mlir::cast(permutation_op.getValue()); for (const auto& value : attr.getValues()) permutation.push_back(value.getSExtValue()); @@ -408,7 +410,7 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list, // update the result type in `FoldOperandsPermutation`. if (layout_agnostic) result.setType(ReversePermuteShapedType( - result.getType().cast(), permutation)); + mlir::cast(result.getType()), permutation)); // Try to push transpose further down. for (Operation* user : result.getUsers()) { @@ -422,7 +424,7 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list, transpose.getOperation()->moveBefore(op->getNextNode()); transpose.setOperand(0, result); transpose.setOperand(1, permutation_op); - transpose.getResult().setType(original_type[idx].cast()); + transpose.getResult().setType(mlir::cast(original_type[idx])); } else { transpose = builder.create(loc, result, permutation_op); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc index bc0534fdb0bb84..c4ea84d8b0948c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc @@ -189,10 +189,10 @@ LogicalResult LiftVariables(ModuleOp module, Session* session) { func, arg_number, symbol_table); if (!global_tensor) continue; - auto arg_type = arg.getType().cast(); + auto arg_type = mlir::cast(arg.getType()); assert(arg_type.getRank() == 0); llvm::ArrayRef underlying_type = - arg_type.getElementType().cast().getSubtypes(); + mlir::cast(arg_type.getElementType()).getSubtypes(); // If the arg type already matches the global_tensor type, we don't need // to do anything. 
@@ -206,7 +206,7 @@ LogicalResult LiftVariables(ModuleOp module, Session* session) { auto new_arg_type = mlir::RankedTensorType::get( /*shape=*/{}, mlir::TF::ResourceType::get( - /*subtypes=*/{global_tensor.getType().cast()}, + /*subtypes=*/{mlir::cast(global_tensor.getType())}, module.getContext())); arg.setType(new_arg_type); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc index cdb256ab25f177..8d58b8177b33c8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_globals_to_ml_program.cc @@ -47,7 +47,7 @@ static LogicalResult traceUpwardsToArgument(Value v, llvm::DenseSet seen, } seen.insert(v); - if (auto blockArg = v.dyn_cast()) { + if (auto blockArg = mlir::dyn_cast(v)) { Operation *op = blockArg.getOwner()->getParentOp(); // If we're in the first block, then the argument to that block is the diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index 1c9b1e03a663c6..da565f00b45b99 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeRange.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" @@ -79,7 +80,7 @@ static DenseElementsAttr GetF32Scalar(OpBuilder *builder, float value) { // Preconditions: The given value must have a ShapedType. static Value CreateTFCastOpF32(OpBuilder *builder, Location loc, Value x, BoolAttr truncate) { - auto x_type = x.getType().dyn_cast_or_null(); + auto x_type = mlir::dyn_cast_or_null(x.getType()); if (!x_type) llvm_unreachable("unsupported type"); Type type = x_type.clone(builder->getF32Type()); return builder->create(loc, type, x, truncate); @@ -92,7 +93,7 @@ static Value CreateTFCastOpF32(OpBuilder *builder, Location loc, Value x, // Preconditions: The given value must have a ShapedType. static Value CreateTFCastOpI32(OpBuilder *builder, Location loc, Value x, BoolAttr truncate) { - auto x_type = x.getType().dyn_cast_or_null(); + auto x_type = mlir::dyn_cast_or_null(x.getType()); if (!x_type) llvm_unreachable("unsupported type"); Type type = x_type.clone(builder->getI32Type()); return builder->create(loc, type, x, truncate); @@ -109,7 +110,8 @@ static APFloat ConvertToAPFloat(double val, Type type) { // Performs the operation of `Shape(input)[idx]`. static Value GetDimensionSize(OpBuilder *builder, Location loc, Value input, int32_t idx, BoolAttr use_32bit) { - if (auto ranked_ty = input.getType().dyn_cast_or_null()) { + if (auto ranked_ty = + mlir::dyn_cast_or_null(input.getType())) { // Canonicalize negative index. if (idx < 0) { idx += ranked_ty.getRank(); @@ -154,7 +156,7 @@ bool QuantizedTypeIsUnsigned(Type type) { // to offset the quantized representation before it gets scaled. In the case // of negative quantize types, this offset is half the type's range. 
static DenseElementsAttr DequantizeHalfRange(OpBuilder *builder, Value input) { - auto input_type = input.getType().dyn_cast_or_null(); + auto input_type = mlir::dyn_cast_or_null(input.getType()); if (!input_type) llvm_unreachable("DequantizeHalfRange: not a ShapedType"); bool is_unsigned = QuantizedTypeIsUnsigned(input_type.getElementType()); float half_range = is_unsigned ? 0 : 128; @@ -183,7 +185,7 @@ DenseIntElementsAttr GetBiasAddGradReductionIndices(int64_t rank, // Infers ExpandDims op output type for the given input type `ty` and dimension // to expand at the given `axis`. Type InferExpandDimsType(Type ty, int64_t axis, Builder *builder) { - auto ranked_ty = ty.dyn_cast(); + auto ranked_ty = mlir::dyn_cast(ty); // Unranked type. if (!ranked_ty) return ty; @@ -258,7 +260,7 @@ class LowerAddNOp : public RewritePattern { // TODO(hinsu): Support variant with TensorList type. tf.AddV2 doesn't // support variant type so variant types require special handling. - if (getElementTypeOrSelf(addn_op.getType()).isa()) + if (mlir::isa(getElementTypeOrSelf(addn_op.getType()))) return failure(); llvm::SmallVector operands(addn_op.getInputs().begin(), addn_op.getInputs().end()); @@ -324,8 +326,7 @@ class LowerDynamicStitchOp : public RewritePattern { // Static output type is used to compute intermediate values. Note that the // output type doesn't have to be static but if input types and indices are // constant, then the output type can be statically determined. - RankedTensorType out_ty = - op.getType().template dyn_cast(); + RankedTensorType out_ty = mlir::dyn_cast(op.getType()); if (!out_ty || !out_ty.hasStaticShape()) return failure(); // Extract out all the constant indices' attributes and verify that data @@ -341,7 +342,7 @@ class LowerDynamicStitchOp : public RewritePattern { indices.push_back(index_attr); RankedTensorType data_ty = - data.getType().template dyn_cast(); + mlir::dyn_cast(data.getType()); if (!data_ty || !data_ty.hasStaticShape()) return failure(); } @@ -367,9 +368,8 @@ class LowerDynamicStitchOp : public RewritePattern { auto reshaped_data = rewriter.create(loc, data, packed_shape_val); - auto num_items = reshaped_data.getType() - .template cast() - .getShape()[0]; + auto num_items = + mlir::cast(reshaped_data.getType()).getShape()[0]; auto items = rewriter.create( loc, SmallVector(num_items, item_ty), reshaped_data, /*axis=*/0); @@ -407,7 +407,7 @@ class ConvertFakeQuantWithMinMaxVarsOp : public RewritePattern { auto op = cast(src_op); auto input = op.getInputs(); - auto input_ty = input.getType().cast(); + auto input_ty = mlir::cast(input.getType()); auto element_ty = input_ty.getElementType(); auto scalar_ty = tensorflow::GetTypeFromTFTensorShape({}, element_ty); @@ -534,7 +534,7 @@ class LowerInvertPermutationOp : public RewritePattern { auto op = cast(src_op); Location loc = op.getLoc(); - auto x_type = op.getX().getType().dyn_cast(); + auto x_type = mlir::dyn_cast(op.getX().getType()); // x input must have static shape. if (!x_type || !x_type.hasStaticShape()) { return failure(); @@ -617,12 +617,13 @@ class LowerLgammaOp : public RewritePattern { Location loc = op.getLoc(); Value input = op.getX(); - TensorType original_tensor_type = op.getX().getType().cast(); + TensorType original_tensor_type = + mlir::cast(op.getX().getType()); // The approximation is not precise enough for float16. Do the computation // in float32 for that case. 
TensorType tensor_type = original_tensor_type; - FloatType float_type = tensor_type.getElementType().cast(); + FloatType float_type = mlir::cast(tensor_type.getElementType()); bool needs_cast = float_type.getWidth() < 32; if (needs_cast) { MLIRContext *context = rewriter.getContext(); @@ -887,17 +888,18 @@ class LowerSpaceToBatchNDOp : public RewritePattern { auto op = cast(src_op); Location loc = op.getLoc(); - auto input_type = op.getInput().getType().cast(); + auto input_type = mlir::cast(op.getInput().getType()); auto element_type = input_type.getElementType(); if (!input_type.hasStaticShape()) { return failure(); } ArrayRef input_shape = input_type.getShape(); - auto block_shape_type = op.getBlockShape().getType().cast(); + auto block_shape_type = + mlir::cast(op.getBlockShape().getType()); if (!block_shape_type.hasStaticShape()) { return failure(); } - auto paddings_type = op.getPaddings().getType().cast(); + auto paddings_type = mlir::cast(op.getPaddings().getType()); if (!paddings_type.hasRank()) { return failure(); } @@ -1100,7 +1102,7 @@ class LowerBatchToSpaceND : public RewritePattern { PatternRewriter &rewriter) const override { auto op = cast(src_op); auto input = op.getInput(); - auto input_ty = input.getType().cast(); + auto input_ty = mlir::cast(input.getType()); auto element_ty = input_ty.getElementType(); if (!input_ty.hasStaticShape()) { return failure(); @@ -1279,9 +1281,7 @@ class LowerSparseMatMulOp : public RewritePattern { // Result type must be f32 for applying the pattern (currently this is // required by the op anyway but this might change). - if (!op.getProduct() - .getType() - .cast() + if (!mlir::cast(op.getProduct().getType()) .getElementType() .isF32()) { return failure(); @@ -1289,7 +1289,7 @@ class LowerSparseMatMulOp : public RewritePattern { MLIRContext *context = rewriter.getContext(); llvm::SmallVector operands{op.getA(), op.getB()}; for (Value &operand : operands) { - TensorType tensor_type = operand.getType().cast(); + TensorType tensor_type = mlir::cast(operand.getType()); Type element_type = tensor_type.getElementType(); if (element_type.isF32()) continue; // Element type can either be f32 or bf16 for `SparseMatMulOp` so it @@ -1374,13 +1374,13 @@ class LowerResizeNearestNeighbor : public RewritePattern { PatternRewriter &rewriter) const override { auto op = cast(src_op); auto loc = op.getLoc(); - auto result_ty = op.getType().cast(); + auto result_ty = mlir::cast(op.getType()); auto input = op.getImages(); - auto input_ty = input.getType().cast(); + auto input_ty = mlir::cast(input.getType()); auto input_element_ty = input_ty.getElementType(); auto out_size = op.getSize(); - auto out_size_ty = out_size.getType().cast(); + auto out_size_ty = mlir::cast(out_size.getType()); auto out_size_element_ty = out_size_ty.getElementType(); // Input should be rank 4. 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc
index cd608bdf269ad7..80e7cd3991c727 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/utils/validators.h"
@@ -57,13 +58,14 @@ class SimplifyBroadcastReshape : public OpRewritePattern<BroadcastToOp> {
     auto reshape_op = llvm::dyn_cast_or_null<ReshapeOp>(user);
     if (!reshape_op) return failure();

-    auto reshape_type = reshape_op.getOutput().getType().cast<ShapedType>();
+    auto reshape_type =
+        mlir::cast<ShapedType>(reshape_op.getOutput().getType());

     if (!reshape_type.hasStaticShape()) return failure();
     ArrayRef<int64_t> reshape_shape = reshape_type.getShape();

-    auto input_type = op.getInput().getType().cast<ShapedType>();
-    auto output_type = op.getOutput().getType().cast<ShapedType>();
+    auto input_type = mlir::cast<ShapedType>(op.getInput().getType());
+    auto output_type = mlir::cast<ShapedType>(op.getOutput().getType());

     if (!input_type.hasRank() || !output_type.hasRank()) return failure();
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc
index eaf881c43df95e..bfed05448bd25a 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc
@@ -94,7 +94,7 @@ GlobalTensorUsesMap CreateGlobalTensorUsesMap(ModuleOp module) {
       continue;
     }
     auto global_tensor = symbol_table.lookup<GlobalTensorOp>(
-        sym.cast<FlatSymbolRefAttr>().getValue());
+        mlir::cast<FlatSymbolRefAttr>(sym).getValue());
     if (!global_tensor) {
       continue;
     }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
index 3a2ba6f181f649..74f25f90dedf33 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_
 #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_PASSES_H_

+#include <cstdint>
 #include <memory>
 #include <string>
@@ -99,7 +100,8 @@ std::unique_ptr<OperationPass<func::FuncOp>>
 CreateReplicateTensorListInitOpsPass();

 // Performs Shape Inference on the TensorFlow dialect using the global registry.
-std::unique_ptr<OperationPass<ModuleOp>> CreateTFShapeInferencePass();
+std::unique_ptr<OperationPass<ModuleOp>> CreateTFShapeInferencePass(
+    ArrayRef<ArrayRef<int64_t>> input_shapes = {});

 // Performs TF.data optimizations.
 std::unique_ptr<OperationPass<func::FuncOp>> CreateTFDataOptimizationPass();
@@ -308,9 +310,8 @@ std::unique_ptr<OperationPass<ModuleOp>> CreateNameAnonymousIteratorsPass();

 // Creates a pass that breaks up an island with multiple ops into multiple
 // islands, each with a single op. This pass intentionally does not propagate
-// control dependencies across newly created islands, a following pass will
-// handle this.
-// TODO(b/244596254) Implement followup pass for creating control deps.
+// control dependencies across newly created islands; propagation is handled
+// by CreateTFExecutorUpdateControlDependenciesPass.
 std::unique_ptr<OperationPass<func::FuncOp>> CreateSplitIntoIslandPerOpPass();

 // Prints, but otherwise pipes through without changes, the current module.
@@ -531,10 +532,6 @@ CreateTPUResourceReadsWritesPartitioningPass();
 std::unique_ptr<OperationPass<func::FuncOp>>
 CreateTPUAnnotateDynamicShapeInputsPass();

-// Creates a pass that identifies XLASharding ops in launch op for TPU
-// computation.
-std::unique_ptr<OperationPass<ModuleOp>> CreateTPUShardingIdentificationPass();
-
 // Creates a pass that moves `tf.AssignVariableOp` into a
 // `tf_device.parallel_execute` region if the `tf.AssignVariableOp` is the
 // only consumer of a `tf_device.parallel_execute` result.
@@ -668,7 +665,6 @@ enum MoveTransposeDirection { kBegin, kEnd };
 #define GEN_PASS_DECL_TPUREORDERREPLICATEANDPARTITIONEDINPUTSPASS
 #define GEN_PASS_DECL_TPURESOURCEREADFORWRITEPASS
 #define GEN_PASS_DECL_TPURESOURCEREADSWRITESPARTITIONINGPASS
-#define GEN_PASS_DECL_TPUSHARDINGIDENTIFICATIONPASS
 #define GEN_PASS_DECL_TPUSPACETODEPTHPASS
 #define GEN_PASS_DECL_TPUUPDATEEMBEDDINGENQUEUEOPINPUTSPASS
 #define GEN_PASS_DECL_TPUVALIDATEINPUTSPASS
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
index 661dafe2a2f327..b968923089cb8f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
@@ -25,6 +25,7 @@ limitations under the License.
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" @@ -79,8 +80,8 @@ class RewriteXlaHostComputeMlir llvm::SmallVector shape_attrs; shape_attrs.reserve(op.getNumResults()); for (Type ty : op.getResultTypes()) { - shape_attrs.push_back( - TF::ShapeAttr::get(rewriter.getContext(), ty.cast())); + shape_attrs.push_back(TF::ShapeAttr::get(rewriter.getContext(), + mlir::cast(ty))); } // Clone the `host_func` in the `host_mlir_module` attribute if it exists diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index a7226b39ebe380..bc64c48c81a596 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -30,6 +30,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -196,8 +197,8 @@ LogicalResult PromoteResourcesToArguments( auto func_args = function.getArguments().take_front( function.getNumArguments() - var_handle_shared_names.size()); for (BlockArgument& func_arg : func_args) { - auto resource_type = - getElementTypeOrSelf(func_arg.getType()).dyn_cast(); + auto resource_type = mlir::dyn_cast( + getElementTypeOrSelf(func_arg.getType())); if (!resource_type) continue; if (failed(ValidateResourceArgument(function, func_arg, resource_type))) return failure(); @@ -212,8 +213,8 @@ LogicalResult PromoteResourcesToArguments( auto var_handle_args = function.getArguments().take_back(var_handle_shared_names.size()); for (BlockArgument& var_handle_arg : var_handle_args) { - auto resource_type = - getElementTypeOrSelf(var_handle_arg.getType()).cast(); + auto resource_type = mlir::cast( + getElementTypeOrSelf(var_handle_arg.getType())); add_resource_argument(var_handle_arg, resource_type); } @@ -226,7 +227,8 @@ LogicalResult PromoteResourcesToArguments( // live value. 
for (Operation& op : llvm::make_early_inc_range(block)) { if (auto read_op = llvm::dyn_cast(&op)) { - if (auto func_arg = read_op.getResource().dyn_cast()) { + if (auto func_arg = + mlir::dyn_cast(read_op.getResource())) { if (func_arg.getOwner() != &block) return read_op.emitOpError(kResourceFunctionMsg); @@ -239,7 +241,8 @@ LogicalResult PromoteResourcesToArguments( read_op.erase(); } else if (auto write_op = llvm::dyn_cast(&op)) { - if (auto func_arg = write_op.getResource().dyn_cast()) { + if (auto func_arg = + mlir::dyn_cast(write_op.getResource())) { if (func_arg.getOwner() != &block) return write_op.emitOpError(kResourceFunctionMsg); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc index 975a1484d6984a..7c488b8992d2cb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -91,7 +92,7 @@ StringRef GetNodeNameFromClassAttrOrSharedNameAttr(Operation *op) { StringRef result; for (Attribute class_attr : classes_attr) { - StringRef node_name = class_attr.cast().getValue(); + StringRef node_name = mlir::cast(class_attr).getValue(); if (!node_name.starts_with(kLocationPrefix)) { continue; } @@ -150,8 +151,8 @@ void ConvertReadonlyReferenceVariablesToResourceVariablesPass:: for (VariableV2Op variable_v2_op : variable_v2s_to_replace) { builder.setInsertionPoint(variable_v2_op); ShapedType shaped_type = - variable_v2_op.getResult().getType().cast(); - TensorType tensor_type = DropRefType(shaped_type).cast(); + mlir::cast(variable_v2_op.getResult().getType()); + TensorType tensor_type = mlir::cast(DropRefType(shaped_type)); StringAttr device_attr = variable_v2_op->getAttrOfType("device"); if (!device_attr) device_attr = builder.getStringAttr(""); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc index a669276e35a175..b740e667dabe84 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc @@ -508,10 +508,10 @@ LogicalResult RegionControlFlowToFunctional::ConvertWhileOp( // existing function as is. auto while_arg_matcher = [](Value first, Region& first_region, Value second, Region& second_region) { - if (!first.isa() || !second.isa()) + if (!mlir::isa(first) || !mlir::isa(second)) return false; - BlockArgument first_block_arg = first.cast(); - BlockArgument second_block_arg = second.cast(); + BlockArgument first_block_arg = mlir::cast(first); + BlockArgument second_block_arg = mlir::cast(second); // 2 block arguments will match if they are the same argument number, and // are block arguments of the corresponding containing regions. 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc b/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc
index 6aa3d161c0e121..18f54d6b5826d3 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/remove_unused_arguments.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "mlir/Interfaces/ControlFlowInterfaces.h"  // from @llvm-project
 #include "mlir/Interfaces/FunctionInterfaces.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project

 namespace mlir {
 namespace TF {
@@ -137,7 +138,7 @@ void RemoveUnusedArgumentsPass::runOnOperation() {
     // SymbolUserOpInterface doesn't tell us which attributes contain
     // the symbols, so we have to scan through all of them.
     for (auto attr : op->getAttrs()) {
-      if (auto sym = attr.getValue().dyn_cast<SymbolRefAttr>()) {
+      if (auto sym = mlir::dyn_cast<SymbolRefAttr>(attr.getValue())) {
         Operation* func = mlir::SymbolTable::lookupNearestSymbolFrom(op, sym);
         if (func) {
           do_not_touch.insert(func);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/remove_vars_in_session_initializer.cc b/tensorflow/compiler/mlir/tensorflow/transforms/remove_vars_in_session_initializer.cc
index 9e85c5f9ed6fda..3a6377a3bb63e1 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/remove_vars_in_session_initializer.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/remove_vars_in_session_initializer.cc
@@ -55,7 +55,7 @@ void RecursiveRemove(Operation* op,
   erase_list.push_back(op);

   for (auto& use : op->getOpOperands()) {
-    if (auto op_result = use.get().dyn_cast<mlir::OpResult>()) {
+    if (auto op_result = mlir::dyn_cast<mlir::OpResult>(use.get())) {
       Operation* def = op_result.getDefiningOp();
       if (!dead_ops.insert(def).second) continue;
       RecursiveRemove(def, erase_list, dead_ops);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
index 1c9558eecda702..803f135af624d7 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/IR/Visitors.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/RegionUtils.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
@@ -90,7 +91,7 @@ void MakeShapeOpInvariant(tf_device::ReplicateOp replicate_op, int num_replicas,
   Value input = shape_op.getInput();
   // If ShapeOp operand is replicate tensor block argument, replace with the
   // associated first replica operand.
-  if (auto block_arg = input.dyn_cast<BlockArgument>()) {
+  if (auto block_arg = mlir::dyn_cast<BlockArgument>(input)) {
     if (block_arg.getOwner() != replicate_block) return;

     shape_op.setOperand(replicate_op.GetReplicaOperandForBlockArgument(
@@ -112,7 +113,8 @@ void MakeShapeOpInvariant(tf_device::ReplicateOp replicate_op, int num_replicas,
   // shape has not changed in replicate prior to read. Currently after both
   // ResourceOpLiftingPass and TPURewritePass, there should not be any updates
   // to resources prior to their respective ReadVariableOp.
-  if (auto block_arg = read_var_op.getResource().dyn_cast<BlockArgument>()) {
+  if (auto block_arg =
+          mlir::dyn_cast<BlockArgument>(read_var_op.getResource())) {
     if (block_arg.getOwner() != replicate_block) return;

     OpBuilder builder(shape_op);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc
index e03eb9a9228f35..90397e7f8237c9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc
@@ -74,14 +74,14 @@ struct ResourceOpLiftingPass
 };

 bool IsResource(Value value) {
-  return getElementTypeOrSelf(value.getType()).isa<TF::ResourceType>();
+  return mlir::isa<TF::ResourceType>(getElementTypeOrSelf(value.getType()));
 }

 // Get the type of the data contained in a resource. Returns null if there is
 // no single type in the resource.
 Type GetResourceSubtype(Value value) {
   auto resource_type =
-      getElementTypeOrSelf(value.getType()).dyn_cast<TF::ResourceType>();
+      mlir::dyn_cast<TF::ResourceType>(getElementTypeOrSelf(value.getType()));
   auto subtypes = resource_type.getSubtypes();
   if (subtypes.size() == 1) return subtypes[0];
   return nullptr;
@@ -691,7 +691,7 @@ void RemoveUnusedResourceArgumentsAndForwardedRetvals(
   int64_t skipped_retvals = 0;
   for (auto entry : llvm::enumerate(old_return_vals)) {
     auto return_val = entry.value();
-    if (auto arg = return_val.dyn_cast<BlockArgument>()) {
+    if (auto arg = mlir::dyn_cast<BlockArgument>(return_val)) {
       auto it = infos.find(arg.getArgNumber());
       if (it != infos.end() && !it->getSecond().used) {
         return_op->eraseOperand(entry.index() - skipped_retvals++);
@@ -747,7 +747,7 @@ LogicalResult LiftArgRetResourcesForFunction(
   // with type replaced.
   llvm::SmallVector<Value, 4> skipped_args;
   for (auto& it : hoister.GetResources()) {
-    BlockArgument arg = it.first.dyn_cast<BlockArgument>();
+    BlockArgument arg = mlir::dyn_cast<BlockArgument>(it.first);
     assert(arg && "Expect resources for FuncOp to be its arguments");
     auto type_iter = resource_data_types.find(arg.getArgNumber());
     if (type_iter == resource_data_types.end()) {
@@ -772,7 +772,7 @@ LogicalResult LiftArgRetResourcesForFunction(
       Value resource = assign_variable_op.getResource();
       if (!hoister.Contains(resource)) continue;

-      auto arg = resource.dyn_cast<BlockArgument>();
+      auto arg = mlir::dyn_cast<BlockArgument>(resource);
       handle_updated_arg_value(arg.getArgNumber(), assign_variable_op.getValue());
       assign_variable_op.erase();
@@ -1018,11 +1018,11 @@ LogicalResult HandlePartitionedCallOpCallee(
   for (auto entry :
        llvm::enumerate(callee.front().getTerminator()->getOperands())) {
     auto retval = entry.value();
-    if (!getElementTypeOrSelf(retval.getType()).isa<TF::ResourceType>()) {
+    if (!mlir::isa<TF::ResourceType>(getElementTypeOrSelf(retval.getType()))) {
       result->old_to_new_output_indices.push_back(non_resource_results++);
       continue;
     }
-    auto aliasing_arg = retval.dyn_cast<BlockArgument>();
+    auto aliasing_arg = mlir::dyn_cast<BlockArgument>(retval);
     if (!aliasing_arg) {
       return callee.emitOpError("unsupported function call: ")
              << "resource return value does not alias an input.";
@@ -1063,7 +1063,7 @@ LogicalResult HandlePartitionedCallOpCallee(
   llvm::SmallVector<int64_t, 4> retval_indices_to_preserve;
   for (auto& val : callee.front().getTerminator()->getOpOperands()) {
     // Store indices of results that are not resources.
-    if (!getElementTypeOrSelf(val.get().getType()).isa<TF::ResourceType>())
+    if (!mlir::isa<TF::ResourceType>(getElementTypeOrSelf(val.get().getType())))
       retval_indices_to_preserve.push_back(val.getOperandNumber());
   }
   int64_t num_retvals = retval_indices_to_preserve.size();
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc
index 2f1c675b305516..303e5aa2b6ddeb 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/IR/Visitors.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
@@ -32,7 +33,7 @@ namespace mlir {
 namespace {

 bool IsResource(Value value) {
-  return getElementTypeOrSelf(value.getType()).isa<TF::ResourceType>();
+  return mlir::isa<TF::ResourceType>(getElementTypeOrSelf(value.getType()));
 }

 // Checks if a cast op is casting a resource -> resource.
@@ -182,7 +183,7 @@ void EliminateUnusedResultsForIfCase(Operation *op,
     if (cloned == func) continue;
     // Patch up the op attribute to point to the new function.
     for (NamedAttribute attr : op->getAttrs()) {
-      auto symref = attr.getValue().dyn_cast<FlatSymbolRefAttr>();
+      auto symref = mlir::dyn_cast<FlatSymbolRefAttr>(attr.getValue());
       if (!symref) continue;
       if (symref.getValue() != func.getName()) continue;
       op->setAttr(attr.getName(),
@@ -301,7 +302,8 @@ LogicalResult ForwardCommonArgToOutput(Operation *op,
   std::optional<int64_t> common_arg_index;
   for (func::FuncOp func : branches) {
     auto ret = func.front().getTerminator();
-    auto block_arg = ret->getOperand(result_idx).dyn_cast<BlockArgument>();
+    auto block_arg =
+        mlir::dyn_cast<BlockArgument>(ret->getOperand(result_idx));
     if (!block_arg) {
       return op->emitOpError("result #")
              << result_idx << " not tied to function argument for branch @"
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
index 2ff6c78896fff2..faedd25114807e 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
@@ -37,12 +37,13 @@ struct RewriteTPUEmbeddingOps

 // Rewrites the given op to `OpT` op after adding the given operand at the end.
 template <typename OpT>
-OpT AddOperandAndRewriteAs(Operation* op, Value operand, OpBuilder* builder) {
+OpT AddOperandAndRewriteAs(Operation* op, Value operand, NamedAttrList attr,
+                           OpBuilder* builder) {
   builder->setInsertionPoint(op);
   auto operands = llvm::to_vector<4>(op->getOperands());
   operands.push_back(operand);
   auto new_op = builder->create<OpT>(op->getLoc(), op->getResultTypes(),
-                                     operands, op->getAttrs());
+                                     operands, attr.getAttrs());
   op->replaceAllUsesWith(new_op.getOperation()->getResults());
   op->erase();
   return new_op;
@@ -83,8 +84,8 @@ LogicalResult RunOnRegion(Region* region) {
   // Rewrite RecvTPUEmbeddingActivations op to the corresponding internal op.
   if (recv_op)
-    AddOperandAndRewriteAs<XlaRecvTPUEmbeddingActivationsOp>(recv_op, dedup_op,
-                                                             &builder);
+    AddOperandAndRewriteAs<XlaRecvTPUEmbeddingActivationsOp>(
+        recv_op, dedup_op, recv_op->getAttrs(), &builder);

   // Rewrite SendTPUEmbeddingGradients op to the corresponding internal op and
   // then update the OperandSegmentSize attribute.
@@ -92,11 +93,11 @@ LogicalResult RunOnRegion(Region* region) {
     int32_t operand_sizes[] = {static_cast<int32_t>(send_op.getN()),
                                static_cast<int32_t>(send_op.getNN()), 1};
     auto operand_size_attr = builder.getDenseI32ArrayAttr(operand_sizes);
+    NamedAttrList attrs(send_op->getAttrs());
+    attrs.set(send_op.getOperandSegmentSizeAttr(), operand_size_attr);

-    auto new_send_op = AddOperandAndRewriteAs<XlaSendTPUEmbeddingGradientsOp>(
-        send_op, dedup_op, &builder);
-    new_send_op->setAttr(new_send_op.getOperandSegmentSizeAttr(),
-                         operand_size_attr);
+    AddOperandAndRewriteAs<XlaSendTPUEmbeddingGradientsOp>(send_op, dedup_op,
+                                                           attrs, &builder);
   }
   return success();
 }
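The rewrite_tpu_embedding_ops.cc change above threads a NamedAttrList into AddOperandAndRewriteAs so that attributes such as the operand segment sizes are fixed up before the replacement op is built, instead of being patched onto the new op afterwards. A minimal sketch of that attribute-override idiom (the attribute name and segment sizes below are placeholders; real code should take the name from getOperandSegmentSizeAttr(), as the patch does):

    #include "mlir/IR/Builders.h"
    #include "mlir/IR/Operation.h"
    #include "mlir/IR/OperationSupport.h"  // mlir::NamedAttrList

    // Copy an op's attributes and override one entry up front, so the
    // replacement op can be created with the final attribute set.
    mlir::NamedAttrList WithSegmentSizes(mlir::Operation* op,
                                         mlir::OpBuilder& builder) {
      mlir::NamedAttrList attrs(op->getAttrs());
      attrs.set("operand_segment_sizes",                     // placeholder name
                builder.getDenseI32ArrayAttr({2, 1, 1}));    // illustrative sizes
      return attrs;
    }

Setting attributes before creation avoids a window in which the new op exists with an inconsistent operand/segment mapping.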
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h
index c86c4383cc602f..4dd6ae7c8e4a7d 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h
@@ -18,6 +18,7 @@ limitations under the License.

 #include "mlir/IR/Matchers.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project

 namespace mlir {
 namespace TF {
@@ -27,13 +28,13 @@ namespace TF {
 template <typename T>
 DenseElementsAttr GetScalarOfType(Type ty, T raw_value) {
   RankedTensorType scalar_ty = RankedTensorType::get({}, ty);
-  if (auto float_ty = ty.dyn_cast<FloatType>()) {
+  if (auto float_ty = mlir::dyn_cast<FloatType>(ty)) {
     FloatAttr attr = FloatAttr::get(float_ty, raw_value);
     return DenseElementsAttr::get(scalar_ty, attr);
-  } else if (auto int_ty = ty.dyn_cast<IntegerType>()) {
+  } else if (auto int_ty = mlir::dyn_cast<IntegerType>(ty)) {
     IntegerAttr attr = IntegerAttr::get(int_ty, raw_value);
     return DenseElementsAttr::get(scalar_ty, attr);
-  } else if (auto complex_ty = ty.dyn_cast<ComplexType>()) {
+  } else if (auto complex_ty = mlir::dyn_cast<ComplexType>(ty)) {
     Type complex_element_ty = complex_ty.getElementType();
     if (complex_element_ty.isF32()) {
       return DenseElementsAttr::get(
@@ -50,13 +51,13 @@ DenseElementsAttr GetScalarOfType(Type ty, T raw_value) {
 // to `raw_value`.
 template <typename T>
 bool IsConstantValueOf(Value value, T raw_value) {
-  auto element_type = value.getType().cast<ShapedType>().getElementType();
-  if (element_type.isa<FloatType>()) {
+  auto element_type = mlir::cast<ShapedType>(value.getType()).getElementType();
+  if (mlir::isa<FloatType>(element_type)) {
     DenseFPElementsAttr float_attr;
     if (matchPattern(value, m_Constant(&float_attr)) && float_attr.isSplat() &&
         float_attr.getSplatValue<APFloat>().isExactlyValue(raw_value))
       return true;
-  } else if (element_type.isa<IntegerType>()) {
+  } else if (mlir::isa<IntegerType>(element_type)) {
     DenseIntElementsAttr int_attr;
     if (matchPattern(value, m_Constant(&int_attr)) && int_attr.isSplat() &&
         int_attr.getSplatValue<APInt>() == raw_value)
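GetScalarOfType builds a rank-0 splat DenseElementsAttr for float, integer, and complex element types. A hedged usage sketch for the float case only (the function name is illustrative, not from this patch):

    #include "mlir/IR/BuiltinAttributes.h"
    #include "mlir/IR/BuiltinTypes.h"

    // Rank-0 splat with value 1.0, roughly what GetScalarOfType(ty, 1)
    // returns when `ty` is a FloatType.
    mlir::DenseElementsAttr MakeScalarOne(mlir::FloatType float_ty) {
      auto scalar_ty = mlir::RankedTensorType::get({}, float_ty);
      return mlir::DenseElementsAttr::get(scalar_ty,
                                          mlir::FloatAttr::get(float_ty, 1.0));
    }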
#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "xla/layout.h" #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" @@ -64,27 +65,27 @@ FailureOr GetTPUInfeedLayout(const ArrayRef types, llvm::SmallVector v; v.reserve(types.size()); for (const mlir::Type &t : types) { - if (t.isa()) continue; + if (mlir::isa(t)) continue; auto layout = GetTPUInfeedLayout({t}, rewriter); if (failed(layout)) return failure(); v.push_back(layout.value()); } ArrayRef shape(v); return rewriter.getArrayAttr(shape); - } else if (types[0].isa()) { - auto tuple_type = types[0].dyn_cast(); + } else if (mlir::isa(types[0])) { + auto tuple_type = mlir::dyn_cast(types[0]); const auto &types = tuple_type.getTypes(); llvm::SmallVector v; v.reserve(types.size()); for (const mlir::Type &t : types) { - if (t.isa()) continue; + if (mlir::isa(t)) continue; auto layout = GetTPUInfeedLayout({t}, rewriter); if (failed(layout)) return failure(); v.push_back(layout.value()); } ArrayRef shape(v); return rewriter.getArrayAttr(shape); - } else if (auto t = types[0].dyn_cast()) { + } else if (auto t = mlir::dyn_cast(types[0])) { if (!t.hasStaticShape()) return failure(); auto layout = GetTPUInfeedLayoutFromAPI(t); std::vector minor_to_major; @@ -129,7 +130,7 @@ bool SetTPUInfeedLayout(mlir::OwningOpRef &mlir_module) { std::vector result_types; for (mlir::Type t : op.getResultTypes()) { - auto ty = t.cast(); + auto ty = mlir::cast(t); if (!ty.hasStaticShape()) return mlir::WalkResult::interrupt(); result_types.push_back(t); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index dc1cfe3f5920fe..6a9527aea26b3f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -29,6 +29,8 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/string_view.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" @@ -75,6 +77,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h" @@ -89,6 +92,7 @@ limitations under the License. 
#include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/ir/types/dialect.h" +#include "tsl/platform/errors.h" #define DEBUG_TYPE "tf-shape-inference" @@ -121,9 +125,9 @@ Type TypeMeet(Type lhs, Type rhs) { DCOMMENT("RefineTypeWith : " << lhs << " : " << rhs); if (lhs == rhs) return lhs; - auto rhs_shape_type = rhs.dyn_cast(); + auto rhs_shape_type = mlir::dyn_cast(rhs); if (!rhs_shape_type) return lhs; - auto lhs_shape_type = lhs.cast(); + auto lhs_shape_type = mlir::cast(lhs); if (lhs_shape_type.hasRank() && rhs_shape_type.hasRank() && lhs_shape_type.getRank() != rhs_shape_type.getRank()) { DCOMMENT("Unexpected rank mismatch: " << lhs << " vs " << rhs); @@ -163,7 +167,8 @@ Type TypeMeet(Type lhs, Type rhs) { // returned type. auto lhs_element_type = lhs_shape_type.getElementType(); auto rhs_element_type_with_subtype = - rhs_shape_type.getElementType().dyn_cast(); + mlir::dyn_cast( + rhs_shape_type.getElementType()); // Look for resource or variant element type and ensure we refine the subtype. // We only support a single subtype at the moment, we won't handle something // like: @@ -171,7 +176,7 @@ Type TypeMeet(Type lhs, Type rhs) { if (rhs_element_type_with_subtype && rhs_element_type_with_subtype.GetSubtypes().size() == 1) { auto lhs_element_type_with_subtype = - lhs_element_type.dyn_cast(); + mlir::dyn_cast(lhs_element_type); TensorType subtype; if (!lhs_element_type_with_subtype) { DCOMMENT( @@ -189,10 +194,9 @@ Type TypeMeet(Type lhs, Type rhs) { // and: // tensor>> // we'll try here to refine tensor with tensor<10x8xf32>. - auto refined_subtype = + auto refined_subtype = mlir::cast( TypeMeet(lhs_element_type_with_subtype.GetSubtypes().front(), - rhs_element_type_with_subtype.GetSubtypes().front()) - .cast(); + rhs_element_type_with_subtype.GetSubtypes().front())); if (refined_subtype != lhs_element_type_with_subtype.GetSubtypes().front()) subtype = refined_subtype; @@ -268,7 +272,7 @@ Value GetElementShapeOperand(Operation* op) { // Utility function to create a ranked tensor type after dropping the first // dimension from the input type. 
 RankedTensorType DropFirstDimension(Type type) {
-  RankedTensorType ranked_type = type.dyn_cast<RankedTensorType>();
+  RankedTensorType ranked_type = mlir::dyn_cast<RankedTensorType>(type);
   if (!ranked_type) return {};
   llvm::ArrayRef<int64_t> dims_except_first =
       ranked_type.getShape().drop_front();
@@ -278,7 +282,7 @@ RankedTensorType DropFirstDimension(Type type) {
 Operation* InsertCast(OpBuilder& b, Location loc, Type dst_type, Value input) {
   Type element_type = getElementTypeOrSelf(dst_type);
-  if (element_type.isa<IndexType>())
+  if (mlir::isa<IndexType>(element_type))
     return b.create<tensor::CastOp>(loc, dst_type, input);
   if (isa<TensorFlowDialect>(element_type.getDialect()))
     return b.create<TF::CastOp>(loc, dst_type, input,
@@ -338,7 +342,7 @@ bool CanInferTensorListElementType(Value tensorlist,
   for (auto& use : tensorlist.getUses()) {
     if (auto push = llvm::dyn_cast<TensorListPushBackOp>(use.getOwner())) {
       auto element_type =
-          push.getTensor().getType().dyn_cast<RankedTensorType>();
+          mlir::dyn_cast<RankedTensorType>(push.getTensor().getType());
       if (!verify_and_update_potential_element_type(element_type))
         return false;
       add_to_worklist(push.getOutputHandle());
@@ -357,7 +361,7 @@ bool CanInferTensorListElementType(Value tensorlist,
     }
     if (auto set_item = llvm::dyn_cast<TensorListSetItemOp>(use.getOwner())) {
       auto element_type =
-          set_item.getItem().getType().dyn_cast<RankedTensorType>();
+          mlir::dyn_cast<RankedTensorType>(set_item.getItem().getType());
       DCOMMENT("\tTensorListSetItemOp " << element_type);
       if (!verify_and_update_potential_element_type(element_type))
         return false;
@@ -429,8 +433,8 @@ bool CanInferTensorListElementType(Value tensorlist,
 // Returns the tensor type created from the `shape_attr` and `type_attr`
 // attributes.
 Type GetType(Attribute shape_attr, Attribute type_attr) {
-  auto shape = shape_attr.cast<tf_type::ShapeAttr>();
-  auto type = type_attr.cast<TypeAttr>();
+  auto shape = mlir::cast<tf_type::ShapeAttr>(shape_attr);
+  auto type = mlir::cast<TypeAttr>(type_attr);
   if (shape.hasRank())
     return tensorflow::GetTypeFromTFTensorShape(shape.getShape(),
                                                 type.getValue());
@@ -441,7 +445,7 @@ Type GetType(Attribute shape_attr, Attribute type_attr) {

 // Returns whether type can be further refined.
 bool CanBeRefined(Type type) {
-  auto shape_type = type.dyn_cast<ShapedType>();
+  auto shape_type = mlir::dyn_cast<ShapedType>(type);
   if (!shape_type) return false;

   // Returns whether type with subtypes can be further refined.
@@ -449,8 +453,8 @@ bool CanBeRefined(Type type) {
     return tws.GetSubtypes().empty() ||
            llvm::any_of(tws.GetSubtypes(), CanBeRefined);
   };
-  auto type_with_subtype =
-      shape_type.getElementType().dyn_cast<TF::TensorFlowTypeWithSubtype>();
+  auto type_with_subtype = mlir::dyn_cast<TF::TensorFlowTypeWithSubtype>(
+      shape_type.getElementType());
   if (type_with_subtype && can_refine_subtypes(type_with_subtype)) return true;

   return !shape_type.hasStaticShape();
@@ -463,7 +467,7 @@ Type GetNewArgType(Type old_arg_type, ArrayRef<int64_t> shape,
                    Type element_type, mlir::MLIRContext* context) {
   Type new_arg_type = tensorflow::GetTypeFromTFTensorShape(shape, element_type);

-  if (auto input_ty = old_arg_type.dyn_cast<RankedTensorType>()) {
+  if (auto input_ty = mlir::dyn_cast<RankedTensorType>(old_arg_type)) {
     ArrayRef<int64_t> bounds = hlo::encodingToBounds(input_ty.getEncoding());
     // The input type has bounded dynamic dimension.
     if (!bounds.empty()) {
@@ -501,12 +505,12 @@ struct ValuePort {

   // Convert output value to ValuePort.
   explicit ValuePort(Value v) {
-    OpResult opr = v.dyn_cast<OpResult>();
+    OpResult opr = mlir::dyn_cast<OpResult>(v);
     if (opr) {
       producer = opr.getOwner();
       port = {opr.getResultNumber()};
     } else {
-      producer = v.cast<BlockArgument>();
+      producer = mlir::cast<BlockArgument>(v);
       port = {0};
     }
   }
@@ -545,7 +549,7 @@ using ValuePortInputs = SmallVectorImpl<ValuePort>;

 // Maps the specified component in the `port` of the given op's result to one of
 // the element in the input.
 ValuePort ComputeInputComponentFor(PackOp op, ArrayRef<unsigned int> port) {
-  auto type = op.getType().cast<TensorType>();
+  auto type = mlir::cast<TensorType>(op.getType());
   if (!type.hasRank() || type.getRank() != 1) return {};
   if (port.size() != 2) return {};
   assert(port[0] == 0);
@@ -558,7 +562,7 @@ ValuePort ComputeInputComponentFor(ConcatV2Op op, ArrayRef<unsigned int> port) {
   int64_t element_idx = port[1];
   for (Value val : op.getValues()) {
-    auto val_ty = val.getType().cast<TensorType>();
+    auto val_ty = mlir::cast<TensorType>(val.getType());
     if (!val_ty.hasStaticShape() || val_ty.getRank() != 1) return {};

     int64_t dim_size = val_ty.getNumElements();
@@ -579,7 +583,7 @@ ValuePort ComputeInputComponentFor(GatherV2Op op, ArrayRef<unsigned int> port) {
   assert(port[0] == 0);

   auto params = op.getParams();
-  auto params_ty = params.getType().dyn_cast<RankedTensorType>();
+  auto params_ty = mlir::dyn_cast<RankedTensorType>(params.getType());
   if (!params_ty || !params_ty.hasStaticShape() || params_ty.getRank() != 1 ||
       op.getBatchDims() != 0) {
     return {};
@@ -683,7 +687,7 @@ Attribute ComputeOutputComponent(const ValuePort& value_port,
   if (auto shape_op = dyn_cast<ShapeOp>(op)) {
     // No shape available in an unranked tensor type.
     auto operand_ty =
-        shape_op.getOperand().getType().dyn_cast<RankedTensorType>();
+        mlir::dyn_cast<RankedTensorType>(shape_op.getOperand().getType());
     if (!operand_ty) return nullptr;

     // Shape op has a single output so the first element should always be zero
@@ -1130,14 +1134,14 @@ bool ShapeInference::InferShapeForCast(Operation* op) {
   if (!new_type) {
     // Combine shape information when leaf element types are not the same, not
     // including shape info in subtypes.
-    auto ranked_operand_type = operand_type.dyn_cast<RankedTensorType>();
+    auto ranked_operand_type = mlir::dyn_cast<RankedTensorType>(operand_type);
    if (!ranked_operand_type) return false;
-    auto ranked_res_type = result.getType().dyn_cast<RankedTensorType>();
+    auto ranked_res_type = mlir::dyn_cast<RankedTensorType>(result.getType());
     if (ranked_res_type &&
         ranked_operand_type.getShape() == ranked_res_type.getShape())
       return false;
-    auto shaped_res_type = result_type.dyn_cast<ShapedType>();
+    auto shaped_res_type = mlir::dyn_cast<ShapedType>(result_type);
     if (!shaped_res_type) return false;
     new_type = tensorflow::GetTypeFromTFTensorShape(
         ranked_operand_type.getShape(), shaped_res_type.getElementType());
@@ -1292,7 +1296,7 @@ bool ShapeInference::InferShapeForXlaCallModule(XlaCallModuleOp op) {
   int next_op_result = 0;
   for (auto output_type : main_output_types) {
     if (tensorflow::IsTokenType(output_type)) continue;
-    auto output_type_ranked = output_type.dyn_cast<RankedTensorType>();
+    auto output_type_ranked = mlir::dyn_cast<RankedTensorType>(output_type);
     if (output_type_ranked == nullptr) {
       llvm::errs() << "Unsupported XlaCallModule result type: " << output_type
                    << "\n";
@@ -1418,20 +1422,20 @@ bool ShapeInference::InferShapeForRestore(Operation* op) {
     if (!assign_op) {
       continue;
     }
-    auto subtypes = getElementTypeOrSelf(assign_op.getResource())
-                        .cast<TF::ResourceType>()
+    auto subtypes = mlir::cast<TF::ResourceType>(
+                        getElementTypeOrSelf(assign_op.getResource()))
                         .getSubtypes();
     if (subtypes.empty()) {
       continue;
     }
-    auto subtype = subtypes.front().dyn_cast<ShapedType>();
+    auto subtype = mlir::dyn_cast<ShapedType>(subtypes.front());
     if (subtype == nullptr) {
       continue;
     }
     // Preserve the dtype from the restore op even if `AssignVariableOp` uses a
     // different dtype, which is possible when there's a `CastOp` between them.
     subtype = subtype.clone(
-        op->getResult(0).getType().cast<ShapedType>().getElementType());
+        mlir::cast<ShapedType>(op->getResult(0).getType()).getElementType());
     // Update the result type of this op with the resource's type. We only use
     // the resource subtype of the first user since shapes from all the users
     // should be equal or compatible.
@@ -1456,7 +1460,7 @@ DatasetInput GetDatasetInput(Value value) {
   while (
       llvm::isa_and_nonnull<IdentityOp, IdentityNOp>(value.getDefiningOp())) {
     value = value.getDefiningOp()->getOperand(
-        value.cast<OpResult>().getResultNumber());
+        mlir::cast<OpResult>(value).getResultNumber());
   }

   Operation* op = value.getDefiningOp();
@@ -1664,14 +1668,14 @@ bool ShapeInference::InferShapeForTensorListPopBackOp(TensorListPopBackOp op) {
   DCOMMENT_OP(op, "Inferring shape for TensorListPopBackOp.");

   auto src_list_handle_t =
-      op.getOperand(0).getType().dyn_cast_or_null<TensorType>();
+      mlir::dyn_cast_or_null<TensorType>(op.getOperand(0).getType());
   if (!src_list_handle_t) return false;

   // Copy of operand tensorlist type.
   TensorType dst_list_handle_t =
       src_list_handle_t.clone(src_list_handle_t.getElementType());
   auto variant_element_t =
-      dst_list_handle_t.getElementType().dyn_cast_or_null<VariantType>();
+      mlir::dyn_cast_or_null<VariantType>(dst_list_handle_t.getElementType());
   if (!variant_element_t || variant_element_t.getSubtypes().size() != 1)
     return false;
@@ -1722,7 +1726,7 @@ bool ShapeInference::InferShapeForVarHandleOp(VarHandleOp op) {
       llvm_unreachable("unexpected operator type");
   }

-  TensorType resource_subtype = value.getType().cast<TensorType>();
+  TensorType resource_subtype = mlir::cast<TensorType>(value.getType());
   ResourceType resource_type =
       ResourceType::get({resource_subtype}, op.getContext());
   UnrankedTensorType new_resource_type =
@@ -1854,7 +1858,7 @@ bool ShapeInference::InferShapeForXlaReduceWindowOp(XlaReduceWindowOp op) {

   bool changed = false;

-  auto input_ty = op.getInput().getType().cast<ShapedType>();
+  auto input_ty = mlir::cast<ShapedType>(op.getInput().getType());
   DenseElementsAttr window_dimensions, window_strides, base_dilations,
       window_dilations, padding;
   if (input_ty.hasStaticShape() &&
@@ -1901,7 +1905,7 @@ bool ShapeInference::InferShapeForXlaReduceWindowOp(XlaReduceWindowOp op) {
     }
     auto output_shape = InferWindowOutputShape(
         input_ty, window.value(),
-        op.getInitValue().getType().cast<ShapedType>().getElementType());
+        mlir::cast<ShapedType>(op.getInitValue().getType()).getElementType());

     if (!output_shape) {
       op->emitOpError("failed to infer output shape");
@@ -1918,8 +1922,8 @@ bool ShapeInference::InferShapeForXlaSelectAndScatterOp(
     XlaSelectAndScatterOp op) {
   DCOMMENT_OP(op, "Inferring shape for XlaSelectAndScatterOp");

-  auto operand_shape = op.getOperand().getType().cast<ShapedType>();
-  auto source_shape = op.getSource().getType().cast<ShapedType>();
+  auto operand_shape = mlir::cast<ShapedType>(op.getOperand().getType());
+  auto source_shape = mlir::cast<ShapedType>(op.getSource().getType());
   DenseElementsAttr window_dimensions, window_strides, padding;
   if (operand_shape.hasRank() && source_shape.hasRank() &&
       matchPattern(op.getWindowDimensions(), m_Constant(&window_dimensions)) &&
@@ -2081,13 +2085,14 @@ LogicalResult PrecheckForXlaConvV2Op(XlaConvV2Op op) {
   int64_t batch_group_count = op.getBatchGroupCount();

   auto input_args_have_static_shape = [&]() -> bool {
-    return input_tensor.getType().cast<TensorType>().hasStaticShape() &&
-           kernel_tensor.getType().cast<TensorType>().hasStaticShape() &&
-           window_strides.getType().cast<TensorType>().hasStaticShape() &&
-           padding.getType().cast<TensorType>().hasStaticShape() &&
-           lhs_dilation.getType().cast<TensorType>().hasStaticShape() &&
-           rhs_dilation.getType().cast<TensorType>().hasStaticShape() &&
-           feature_group_count.getType().cast<TensorType>().hasStaticShape();
+    return mlir::cast<TensorType>(input_tensor.getType()).hasStaticShape() &&
+           mlir::cast<TensorType>(kernel_tensor.getType()).hasStaticShape() &&
+           mlir::cast<TensorType>(window_strides.getType()).hasStaticShape() &&
+           mlir::cast<TensorType>(padding.getType()).hasStaticShape() &&
+           mlir::cast<TensorType>(lhs_dilation.getType()).hasStaticShape() &&
+           mlir::cast<TensorType>(rhs_dilation.getType()).hasStaticShape() &&
+           mlir::cast<TensorType>(feature_group_count.getType())
+               .hasStaticShape();
   };

   // Return failure when one of the input args has not a static shape
@@ -2096,9 +2101,9 @@ LogicalResult PrecheckForXlaConvV2Op(XlaConvV2Op op) {
   }

   auto input_tensor_shape =
-      input_tensor.getType().cast<RankedTensorType>().getShape();
+      mlir::cast<RankedTensorType>(input_tensor.getType()).getShape();
   auto kernel_tensor_shape =
-      kernel_tensor.getType().cast<RankedTensorType>().getShape();
+      mlir::cast<RankedTensorType>(kernel_tensor.getType()).getShape();

   if (input_tensor_shape.size() <= 2) {
     return op.emitOpError()
@@ -2225,14 +2230,16 @@ bool ShapeInference::InferShapeForXlaConvV2Op(XlaConvV2Op op) {
     xla::ConvolutionDimensionNumbers dnums;
     dnums.ParseFromString(op.getDimensionNumbersAttr().getValue().str());

-    auto input_tensor_shape = input_tensor.getType().cast<RankedTensorType>();
+    auto input_tensor_shape =
+        mlir::cast<RankedTensorType>(input_tensor.getType());
     for (auto i = 0; i < input_tensor_shape.getShape().size(); ++i) {
       DCOMMENT("Input Tensor Shape " << i << "th is "
                                      << input_tensor_shape.getShape()[i]);
       input_tensor_dims_vec.push_back(input_tensor_shape.getShape()[i]);
     }

-    auto kernel_tensor_shape = kernel_tensor.getType().cast<RankedTensorType>();
+    auto kernel_tensor_shape =
+        mlir::cast<RankedTensorType>(kernel_tensor.getType());
     for (auto i = 0; i < kernel_tensor_shape.getShape().size(); ++i) {
       DCOMMENT("Kernel tensor Shape" << i << "th is "
                                      << kernel_tensor_shape.getShape()[i]);
@@ -2315,7 +2322,7 @@ bool ShapeInference::RefineWithInferTypeOpInterface(
 ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result,
                                                  InferenceContext* ic) {
   LLVM_DEBUG(result.print(llvm::dbgs() << "\nEvaluate partially "));
-  auto rt = result.getType().dyn_cast<RankedTensorType>();
+  auto rt = mlir::dyn_cast<RankedTensorType>(result.getType());
   if (!rt || !rt.hasStaticShape() || rt.getRank() != 1) return {};
   int dim_size = rt.getDimSize(0);
@@ -2362,7 +2369,7 @@ ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result,
     // If worklist is empty, then this is the root query op.
     if (worklist.empty()) {
       LLVM_DEBUG(llvm::dbgs() << "[root node]\n");
-      if (auto dea = ret.dyn_cast<DenseElementsAttr>()) {
+      if (auto dea = mlir::dyn_cast<DenseElementsAttr>(ret)) {
         if (dea.getNumElements() != 1) {
           LLVM_DEBUG(llvm::dbgs() << "Unexpected number of elements\n");
           return {};
@@ -2400,7 +2407,7 @@ bool ShapeInference::RefineTypeForPassThroughOperands(Operation* op,
   for (auto entry : llvm::zip(operands, results)) {
     Type operand_type = std::get<0>(entry).getType();
     Value result = std::get<1>(entry);
-    TensorType result_type = result.getType().cast<TensorType>();
+    TensorType result_type = mlir::cast<TensorType>(result.getType());
     Type inferred_type = TypeMeet(result_type, operand_type);
     if (result_type == inferred_type) continue;
@@ -2466,10 +2473,10 @@ bool ShapeInference::InferShapeForNonTFDialectOperation(Operation* op) {
 Type GetElementTypeFromOperand(TensorType operand_type,
                                TensorType result_type) {
   auto operand_handle_type =
-      operand_type.getElementType().dyn_cast<TF::TensorFlowTypeWithSubtype>();
+      mlir::dyn_cast<TF::TensorFlowTypeWithSubtype>(
+          operand_type.getElementType());
   if (!operand_handle_type) return result_type.getElementType();
   auto result_handle_type =
-      result_type.getElementType().cast<TF::TensorFlowTypeWithSubtype>();
+      mlir::cast<TF::TensorFlowTypeWithSubtype>(result_type.getElementType());
   if (operand_handle_type.GetSubtypes().empty() ||
       !result_handle_type.GetSubtypes().empty())
     return result_type.getElementType();
@@ -2505,9 +2512,8 @@ bool ShapeInference::InferShapeForWhile(WhileOpTy op,
   for (auto entry :
        zip(op.getInput().getTypes(), op.getOutput(), body_result_types)) {
     Value result = std::get<1>(entry);
-    TensorType body_result_type =
-        std::get<2>(entry).template cast<TensorType>();
-    auto result_type = result.getType().cast<TensorType>();
+    TensorType body_result_type = mlir::cast<TensorType>(std::get<2>(entry));
+    auto result_type = mlir::cast<TensorType>(result.getType());

     Type potential_refined_type;
     if (CanWhileTypeBeRefinedWith(result_type, body_result_type)) {
@@ -2518,7 +2524,7 @@ bool ShapeInference::InferShapeForWhile(WhileOpTy op,
               : std::optional<ArrayRef<int64_t>>(),
           element_type);
     } else {
-      TensorType operand_type = std::get<0>(entry).template cast<TensorType>();
+      TensorType operand_type = mlir::cast<TensorType>(std::get<0>(entry));
       Type element_type = GetElementTypeFromOperand(operand_type, result_type);
       potential_refined_type = CreateTensorType(
           result_type.hasRank() ? result_type.getShape()
                                 : std::optional<ArrayRef<int64_t>>(),
@@ -2671,7 +2677,8 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op,
   // Return result element type at `index`.
   auto result_element_type_fn = [&](int index) {
-    return op->getResult(index).getType().cast<TensorType>().getElementType();
+    return mlir::cast<TensorType>(op->getResult(index).getType())
+        .getElementType();
   };

   llvm::SmallVector<ShapedTypeComponents, 4> inferred_return_shapes;
@@ -2698,7 +2705,7 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op,
       inferred_type = UnrankedTensorType::get(inferred.getElementType());
     }
     inferred_type =
-        TypeMeet(op_result.getType(), inferred_type).cast<TensorType>();
+        mlir::cast<TensorType>(TypeMeet(op_result.getType(), inferred_type));
     if (op_result.getType() == inferred_type) continue;
     if (!UpdateTypeAndInsertIncompatibleUseCasts(inferred_type, op_result))
       continue;
@@ -2879,19 +2886,19 @@ llvm::SmallVector<Type, 4> GetWhileCompatibleTypes(
   types.reserve(operand_types.size());
   for (auto entry :
        llvm::zip(operand_types, result_types, region_argument_types)) {
-    auto operand_type = std::get<0>(entry).cast<TensorType>();
-    auto result_type = std::get<1>(entry).cast<TensorType>();
+    auto operand_type = mlir::cast<TensorType>(std::get<0>(entry));
+    auto result_type = mlir::cast<TensorType>(std::get<1>(entry));
     if (operand_type == result_type) {
       types.push_back(operand_type);
     } else if (RankedAndSameRank(operand_type, result_type)) {
-      auto potential_refined_type =
-          GetCompatibleRankedTensorType(operand_type.cast<RankedTensorType>(),
-                                        result_type.cast<RankedTensorType>());
+      auto potential_refined_type = GetCompatibleRankedTensorType(
+          mlir::cast<RankedTensorType>(operand_type),
+          mlir::cast<RankedTensorType>(result_type));
       types.push_back(potential_refined_type);
     } else {
-      auto region_argument_type = std::get<2>(entry).cast<TensorType>();
+      auto region_argument_type = mlir::cast<TensorType>(std::get<2>(entry));
       Type element_type = GetElementTypeFromOperand(
-          operand_type.cast<TensorType>(), region_argument_type);
+          mlir::cast<TensorType>(operand_type), region_argument_type);
       Type potential_refined_type = CreateTensorType(
           region_argument_type.hasRank()
               ? region_argument_type.getShape()
              : std::optional<ArrayRef<int64_t>>(),
@@ -3064,7 +3071,7 @@ LogicalResult ShapeInference::TryToFold(Operation* op) {
     }
   }

-  if (ElementsAttr eattr = attr.dyn_cast_or_null<ElementsAttr>()) {
+  if (ElementsAttr eattr = mlir::dyn_cast_or_null<ElementsAttr>(attr)) {
    if (std::get<0>(result).getType() == eattr.getType()) continue;

     (void)UpdateTypeAndInsertIncompatibleUseCasts(eattr.getType(),
@@ -3224,6 +3231,25 @@ static FailureOr<bool> InferShapeForFunction(ShapeInference& context,
   return true;
 }

+absl::StatusOr<SmallVector<SmallVector<int64_t>>> ParseArgumentShapes(
+    absl::string_view input_shapes) {
+  SmallVector<SmallVector<int64_t>> parsed_shapes;
+  if (input_shapes.empty()) {
+    return parsed_shapes;
+  }
+
+  std::vector<std::optional<std::vector<int>>> shapes;
+  TF_RETURN_IF_ERROR(::tensorflow::ParseNodeShapes(input_shapes, shapes));
+
+  for (const auto& shape : shapes) {
+    if (!shape) {
+      return absl::AbortedError("Missing input argument shapes");
+    }
+    parsed_shapes.push_back(SmallVector<int64_t>(shape->begin(), shape->end()));
+  }
+  return parsed_shapes;
+}
+
 FailureOr<bool> InferShapeForFunction(func::FuncOp func,
                                       ArrayRef<ArrayRef<int64_t>> arg_shapes,
                                       int64_t graph_version,
@@ -3245,13 +3271,15 @@ FailureOr<bool> InferShapeForFunction(func::FuncOp func,
   for (size_t i = 0; i < func_type.getNumInputs(); ++i) {
     ArrayRef<int64_t> shape = arg_shapes[i];
     Type element_type;
-    if (auto input_ty = func_type.getInput(i).dyn_cast<RankedTensorType>()) {
+    if (auto input_ty =
+            mlir::dyn_cast<RankedTensorType>(func_type.getInput(i))) {
       if (input_ty.getRank() != shape.size()) {
         return failure();
       }
       element_type = input_ty.getElementType();
     } else {
-      auto unranked_input_ty = func_type.getInput(i).dyn_cast<TensorType>();
+      auto unranked_input_ty =
+          mlir::dyn_cast<TensorType>(func_type.getInput(i));
       if (!unranked_input_ty) {
         return failure();
       }
@@ -3284,7 +3312,8 @@ FailureOr<bool> InferShapeForFunction(func::FuncOp func,
 }

 FailureOr<bool> InferModuleShape(ModuleOp module, int64_t max_iterations,
-                                 ArrayRef<TypeID> ops_to_skip) {
+                                 ArrayRef<TypeID> ops_to_skip,
+                                 ArrayRef<ArrayRef<int64_t>> input_shapes) {
   auto producer_or = tensorflow::GetTfGraphProducerVersion(module);
   if (!producer_or.ok()) {
     // TODO(jpienaar): Keeping the existing behavior for now but this could
@@ -3294,13 +3323,30 @@ FailureOr<bool> InferModuleShape(ModuleOp module, int64_t max_iterations,
     return true;
   }
   int64_t producer = producer_or.value();
+
   // TODO(jpienaar): Clean up propagate_NextIterationSinkOp_callee_constants if
   // it is no longer needed.
   ShapeInference context(producer, module,
                          /*propagate_caller_callee_constants=*/false,
                          ops_to_skip);
-  if (auto main = module.lookupSymbol<mlir::func::FuncOp>("main"))
+  auto main = module.lookupSymbol<mlir::func::FuncOp>("main");
+  // Error if no main to refine with input shapes
+  if (!main && !input_shapes.empty()) {
+    return module->emitError(
+        "Input shapes provided but no `main` function found.");
+  }
+
+  // Add main function to head of queue, refine input shapes if provided
+  if (main) {
+    if (!input_shapes.empty()) {
+      FailureOr<bool> failure_or_converged =
+          InferShapeForFunction(main, input_shapes, producer,
+                                /*max_iterations=*/10, ops_to_skip);
+      if (failed(failure_or_converged) || !failure_or_converged.value())
+        return failure_or_converged;
+    }
     context.enqueue(main);
+  }
   for (auto func : module.getOps<func::FuncOp>()) context.enqueue(func);
   // Arbitrarily upper bound the maximum number of functions that get processed
   // just to avoid pathological cases.
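ParseArgumentShapes accepts TensorFlow's NodeShape syntax: `,` separates dimensions, `:` separates arguments, and `?` marks an unknown dimension. A hedged sketch of wiring it into the new InferModuleShape overload (the wrapper function and the shape string are illustrative, assuming the declarations added to shape_inference.h below):

    #include <cstdint>

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "mlir/IR/BuiltinOps.h"
    #include "tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h"

    // "1,224,224,3:64" describes two arguments: [1,224,224,3] and [64].
    // Refines `main` with these shapes, then runs inference to convergence.
    mlir::LogicalResult RefineFromNodeShapeString(mlir::ModuleOp module) {
      auto parsed = mlir::TF::ParseArgumentShapes("1,224,224,3:64");
      if (!parsed.ok()) return mlir::failure();
      llvm::SmallVector<llvm::ArrayRef<int64_t>> shapes(parsed->begin(),
                                                        parsed->end());
      auto converged = mlir::TF::InferModuleShape(module, /*max_iterations=*/10,
                                                  /*ops_to_skip=*/{}, shapes);
      return mlir::failure(failed(converged) || !converged.value());
    }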
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h
index bc1cf7b3c8f475..46c1bc9c00e55a 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h
@@ -18,6 +18,8 @@ limitations under the License.

 #include <cstdint>

+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
@@ -42,8 +44,18 @@ Type GetNewArgType(Type old_arg_type, ArrayRef<int64_t> shape,
 // whose type is in ops_to_skip.
 // Returns a failure() on error, otherwise returns true to indicate that it
 // reached convergence, false otherwise.
+// If input shapes are provided, first refines the `main` function using
+// InferShapeForFunction.
 FailureOr<bool> InferModuleShape(ModuleOp module, int64_t max_iterations = 10,
-                                 ArrayRef<TypeID> ops_to_skip = {});
+                                 ArrayRef<TypeID> ops_to_skip = {},
+                                 ArrayRef<ArrayRef<int64_t>> input_shapes = {});
+
+// Given a tensorflow NodeShape string, returns a vector of argument shapes
+// that can be used with InferShapeForFunction.
+// TF NodeShape uses `,` to separate dimensions, and `:` to separate arguments.
+// Ex: 1,2:3,4,5:6,? --> [[1, 2], [3, 4, 5], [6, ?]]
+absl::StatusOr<SmallVector<SmallVector<int64_t>>> ParseArgumentShapes(
+    absl::string_view input_shapes);

 // Given a list of refined shapes matching the function arguments of func, runs
 // shape inference over the function to propagate this updated information,
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc
index 37bcb46b95cc57..392b7807b0d418 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc
@@ -13,15 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#include <memory>
+#include <cstdint>
+#include <memory>

-#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h"
-#include "tensorflow/core/framework/shape_inference.h"

 namespace mlir {
 namespace TF {
@@ -36,9 +38,26 @@ namespace {
 class ShapeInference
     : public impl::TensorFlowShapeInferencePassBase<ShapeInference> {
  public:
+  ShapeInference() = default;
+  explicit ShapeInference(ArrayRef<ArrayRef<int64_t>> input_shapes)
+      : input_shapes_(input_shapes) {}
   void runOnOperation() override {
-    auto failure_or_converged =
-        InferModuleShape(getOperation(), max_iterations_, /*ops_to_skip=*/{});
+    // Parse `input_arg_shapes_` if provided (test only)
+    SmallVector<ArrayRef<int64_t>> input_shapes_vec;
+    absl::StatusOr<SmallVector<SmallVector<int64_t>>> parsed_shapes;
+    if (!input_arg_shapes_.empty()) {
+      parsed_shapes = ParseArgumentShapes(input_arg_shapes_);
+      if (!parsed_shapes.ok()) {
+        getOperation().emitError() << parsed_shapes.status().message();
+        return signalPassFailure();
+      }
+      input_shapes_vec = SmallVector<ArrayRef<int64_t>>{parsed_shapes->begin(),
+                                                        parsed_shapes->end()};
+      input_shapes_ = input_shapes_vec;
+    }
+
+    auto failure_or_converged = InferModuleShape(
+        getOperation(), max_iterations_, /*ops_to_skip=*/{}, input_shapes_);
     if (failed(failure_or_converged)) return signalPassFailure();
     if (!failure_or_converged.value()) {
       getOperation().emitError()
@@ -47,11 +66,15 @@ class ShapeInference
       return signalPassFailure();
     }
   }
+
+ private:
+  ArrayRef<ArrayRef<int64_t>> input_shapes_;
 };
 }  // namespace

-std::unique_ptr<OperationPass<ModuleOp>> CreateTFShapeInferencePass() {
-  return std::make_unique<ShapeInference>();
+std::unique_ptr<OperationPass<ModuleOp>> CreateTFShapeInferencePass(
+    ArrayRef<ArrayRef<int64_t>> input_shapes) {
+  return std::make_unique<ShapeInference>(input_shapes);
 }

 }  // namespace TF
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc
index e565d50660558c..abef8ee04f2212 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc
@@ -174,8 +174,8 @@ namespace TFDevice {
 namespace {

 bool IsResourceType(Type val_type) {
-  if (auto tensor_type = val_type.dyn_cast<TensorType>()) {
-    if (tensor_type.getElementType().isa<TF::ResourceType>()) {
+  if (auto tensor_type = mlir::dyn_cast<TensorType>(val_type)) {
+    if (mlir::isa<TF::ResourceType>(tensor_type.getElementType())) {
       return true;
     }
   }
@@ -588,7 +588,7 @@ void GatherOpsForExtraction(mlir::SetVector<Operation*>* operations,
     if (predecessors) {
       for (Value operand : op->getOperands()) {
         // Stop at the block boundary.
-        if (operand.isa<BlockArgument>()) continue;
+        if (mlir::isa<BlockArgument>(operand)) continue;

         Operation* predecessor = operand.getDefiningOp();
         if (!operations->contains(predecessor) &&
@@ -1867,7 +1867,7 @@ void EmbeddingPipeliningPass::runOnOperation() {
   for (int ret_pos = 0; ret_pos < orig_return_op->getNumOperands(); ++ret_pos) {
     auto operand = orig_return_op->getOperand(ret_pos);
     auto def_op = operand.getDefiningOp();
-    auto result = operand.dyn_cast<mlir::OpResult>();
+    auto result = mlir::dyn_cast<mlir::OpResult>(operand);
     if (def_op == non_tpu_caller) {
       loop_arg_update_map_non_tpu[result.getResultNumber()] = ret_pos;
     } else if (def_op == core_tpu_caller) {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
index 3e41762feb16c2..1e7958660fd8c4 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
@@ -314,7 +314,7 @@ void CreateReducedLaunchOp(OpBuilder* builder, Block* old_block,
   // Handle pass through block arguments.
   for (OpOperand& operand :
        original_launch_op.GetBody().getTerminator()->getOpOperands()) {
-    if (operand.get().isa<BlockArgument>()) {
+    if (mlir::isa<BlockArgument>(operand.get())) {
       original_launch_op.getResult(operand.getOperandNumber())
           .replaceAllUsesWith(operand.get());
     }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc
index 577b374a43847d..b224b723cda50d 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc
@@ -95,8 +95,8 @@ std::vector<Type> GetValueTypes(const InputContainer& input) {
 }

 bool IsResourceType(Type val_type) {
-  if (auto tensor_type = val_type.dyn_cast<TensorType>()) {
-    if (tensor_type.getElementType().isa<TF::ResourceType>()) {
+  if (auto tensor_type = mlir::dyn_cast<TensorType>(val_type)) {
+    if (mlir::isa<TF::ResourceType>(tensor_type.getElementType())) {
       return true;
     }
   }
@@ -139,7 +139,7 @@ void GatherOpsForExtraction(mlir::SetVector<Operation*>* operations,
     if (predecessors) {
       for (Value operand : op->getOperands()) {
         // Stop at the block boundary.
-        if (operand.isa<BlockArgument>()) continue;
+        if (mlir::isa<BlockArgument>(operand)) continue;

         Operation* predecessor = operand.getDefiningOp();
         if (!operations->contains(predecessor) &&
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
index fb9848dbaeac47..476a67b496355f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
@@ -73,7 +73,7 @@ Type GetSizeVarType(OpBuilder builder) {
 // forwards the argument. Otherwise, returns -1.
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
index fb9848dbaeac47..476a67b496355f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc
@@ -73,7 +73,7 @@ Type GetSizeVarType(OpBuilder builder) {
 // forwards the argument. Otherwise, returns -1.
 int64_t FindAliasedInput(func::FuncOp func, int64_t return_index) {
   Value return_val = func.front().getTerminator()->getOperand(return_index);
-  auto maybe_arg = return_val.dyn_cast<BlockArgument>();
+  auto maybe_arg = mlir::dyn_cast<BlockArgument>(return_val);
   if (!maybe_arg) return -1;
   return maybe_arg.getArgNumber();
 }
@@ -180,8 +180,8 @@ LogicalResult HandleWhileOp(
       while_op.getLoc(), body.getFunctionType().getInputs(), new_while_operands,
       while_op->getAttrs());
   for (int64_t i = 0; i < while_op.getNumResults(); ++i) {
-    if (!getElementTypeOrSelf(while_op.getOperand(i).getType())
-             .isa<TF::ResourceType>()) {
+    if (!mlir::isa<TF::ResourceType>(
+            getElementTypeOrSelf(while_op.getOperand(i).getType()))) {
       continue;
     }
     int64_t aliased_input = FindAliasedInput(body, i);
@@ -233,7 +233,7 @@ LogicalResult HandleIfOp(
       if_op.getLoc(), then_func.getFunctionType().getResults(), new_if_operands,
       if_op->getAttrs());
   for (auto result : if_op.getResults()) {
-    if (!getElementTypeOrSelf(result.getType()).isa<TF::ResourceType>()) {
+    if (!mlir::isa<TF::ResourceType>(getElementTypeOrSelf(result.getType()))) {
       continue;
     }
     int64_t then_aliased_input =
@@ -287,8 +287,8 @@ LogicalResult HandlePartitionedCallOp(
           const_cast<func::FuncOp&>(info.decomposed_callee).getName()));
   for (int64_t i = 0; i < call.getNumResults(); ++i) {
     auto result = call.getResult(i);
-    if (!getElementTypeOrSelf(result.getType())
-             .template isa<TF::ResourceType>()) {
+    if (!mlir::isa<TF::ResourceType>(
+            getElementTypeOrSelf(result.getType()))) {
       continue;
     }
     int64_t aliased_input = FindAliasedInput(info.decomposed_callee, i);
@@ -328,9 +328,9 @@ LogicalResult HandlePartitionedCallOp(
   } else {
     info.decomposed_callee = lowered_callee;
     for (auto& entry : callee_map) {
-      info.stack_var_arg_to_size_arg
-          [entry.getFirst().cast<BlockArgument>().getArgNumber()] =
-          entry.getSecond().cast<BlockArgument>().getArgNumber();
+      info.stack_var_arg_to_size_arg[mlir::cast<BlockArgument>(entry.getFirst())
+                                         .getArgNumber()] =
+          mlir::cast<BlockArgument>(entry.getSecond()).getArgNumber();
     }
     if (lowered_callee != callee) {
       // Add the clone with a new name.
@@ -372,7 +372,7 @@ LogicalResult HandleStackV2Op(
   auto size_var_type = GetSizeVarType(builder);
   auto var_type = RankedTensorType::get(
       {}, TF::ResourceType::get(
-              ArrayRef<TensorType>{buffer.getType().cast<TensorType>()},
+              ArrayRef<TensorType>{mlir::cast<TensorType>(buffer.getType())},
               stack.getContext()));
   auto local_var = builder.create<TF::MlirLocalVarOp>(
       stack.getLoc(), ArrayRef<Type>{var_type}, ArrayRef<Value>{});
@@ -446,7 +446,8 @@ LogicalResult HandleRegionControlFlowOps(
     llvm::StringMap<PartitionedCallStackOpsInfo>*
         decomposed_partitioned_call_callees) {
   for (OpOperand& operand : op.getOpOperands()) {
-    if (getElementTypeOrSelf(operand.get().getType()).isa<TF::ResourceType>()) {
+    if (mlir::isa<TF::ResourceType>(
+            getElementTypeOrSelf(operand.get().getType()))) {
       return op.emitOpError()
              << "found unexpected type " << operand.get().getType()
              << " of operand #" << operand.getOperandNumber()
@@ -455,7 +456,7 @@ LogicalResult HandleRegionControlFlowOps(
     }
   }
   for (OpResult result : op.getResults()) {
-    if (getElementTypeOrSelf(result.getType()).isa<TF::ResourceType>()) {
+    if (mlir::isa<TF::ResourceType>(getElementTypeOrSelf(result.getType()))) {
       return op.emitOpError()
              << "found unexpected type " << result.getType()
              << " of result #" << result.getResultNumber()
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc
index b18a6a3496649a..267f32daa9f6e6 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_device_copy_conversion.cc
@@ -25,6 +25,7 @@ limitations under the License.
#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" @@ -67,7 +68,7 @@ void TensorDeviceCopyConversionPass::runOnOperation() { (isa(def_op))) { return true; } - if (BlockArgument block_arg = arg.dyn_cast()) { + if (BlockArgument block_arg = mlir::dyn_cast(arg)) { // Skip the folding logic if the block argument is not from the function // arguments. This can happen when the argument is from a while loop. if (block_arg.getParentRegion() != &func_op.getRegion()) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 278ba1f7fdf65b..a9ad31a28461f7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -31,6 +31,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -510,10 +511,10 @@ LogicalResult HandlePartitionedCallOp( } else { info.signature_change = true; for (auto& entry : callee_map) { - auto buffer_arg = entry.getFirst().dyn_cast(); + auto buffer_arg = mlir::dyn_cast(entry.getFirst()); if (!buffer_arg) continue; info.buffer_arg_to_size_arg[buffer_arg.getArgNumber()] = - entry.getSecond().size.cast().getArgNumber(); + mlir::cast(entry.getSecond().size).getArgNumber(); } if (lowered_callee != callee) { // Add the clone with a new name. @@ -549,7 +550,8 @@ LogicalResult GetConstShapeValue(Value shape_value, // return error. 
 LogicalResult GetElementShapeFromResultType(
     Type type, llvm::SmallVector<int64_t, 8>* shape) {
-  auto variant_type = getElementTypeOrSelf(type).dyn_cast<TF::VariantType>();
+  auto variant_type =
+      mlir::dyn_cast<TF::VariantType>(getElementTypeOrSelf(type));
   if (!variant_type || variant_type.getSubtypes().size() != 1) return failure();
   TensorType tensor_type = variant_type.getSubtypes().front();
   if (!tensor_type.hasStaticShape()) return failure();
@@ -619,7 +621,7 @@ LogicalResult HandleTensorListFromTensorOp(
   Value buffer = builder.create<TF::IdentityOp>(
       list.getLoc(), ArrayRef<Type>{list.getTensor().getType()},
       ArrayRef<Value>{list.getTensor()});
-  auto type = buffer.getType().cast<RankedTensorType>();
+  auto type = mlir::cast<RankedTensorType>(buffer.getType());
   if (!type.hasStaticShape()) {
     return list.emitOpError("TensorListFromTensorOp input has unknown shape.");
   }
@@ -733,8 +735,8 @@ LogicalResult HandleTensorListLengthOp(
   OpBuilder builder(length);
   if (it->getSecond().fixed) {
     auto dim = cutil::CreateScalarConst(
-        length.getInputHandle().getType().cast<RankedTensorType>().getDimSize(
-            0),
+        mlir::cast<RankedTensorType>(length.getInputHandle().getType())
+            .getDimSize(0),
         builder, length.getLoc());
     length.getLength().replaceAllUsesWith(dim);
   } else {
@@ -760,7 +762,7 @@ LogicalResult HandleTensorListElementShapeOp(
   }
   auto buffer = elem_shape.getInputHandle();
   auto result = cutil::GetR1Const(
-      buffer.getType().cast<RankedTensorType>().getShape().drop_front(),
+      mlir::cast<RankedTensorType>(buffer.getType()).getShape().drop_front(),
       OpBuilder(elem_shape), elem_shape.getLoc(),
       elem_shape.getShapeType().getIntOrFloatBitWidth());
   elem_shape.getElementShape().replaceAllUsesWith(result);
@@ -792,7 +794,8 @@ LogicalResult HandleTensorListScatterIntoExistingListOp(
   }
   auto buffer = scatter.getInputHandle();
   OpBuilder builder(scatter);
-  auto indices_type = scatter.getIndices().getType().cast<RankedTensorType>();
+  auto indices_type =
+      mlir::cast<RankedTensorType>(scatter.getIndices().getType());
   if (!indices_type) return scatter.emitOpError("unranked indices shape");
   auto shape_type = RankedTensorType::get({2}, builder.getIntegerType(32));
   auto shape = builder.create<TF::ConstOp>(
@@ -874,7 +877,8 @@ LogicalResult DecomposeTensorListOpsInternal(
   } else if (auto addn = llvm::dyn_cast<TF::AddNOp>(&op)) {
     auto it = buffer_to_size->find(addn.getOperand(0));
     if (it != buffer_to_size->end()) {
-      addn.getSum().setType(addn.getOperand(0).getType().cast<TensorType>());
+      addn.getSum().setType(
+          mlir::cast<TensorType>(addn.getOperand(0).getType()));
       auto size = it->getSecond();
       (*buffer_to_size)[addn.getSum()] = size;
     }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td
index 6b53cae7099688..dbe938e01a519b 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td
@@ -193,56 +193,6 @@ def StackOpsDecompositionPass : Pass<"tf-stack-ops-decomposition", "ModuleOp"> {

-def TPUShardingIdentificationPass : Pass<"tf-tpu-sharding-identification", "ModuleOp"> {
-  let summary = "Identifies and handles inputs/outputs of TPU computation that is "
-                "sharded across logical cores.";
-  let constructor = "TFTPU::CreateTPUShardingIdentificationPass()";
-  let description = [{
-    Bubbles up sharding configuration from `cluster_func` regions into
-    the attributes of `cluster_func`. This is done by parsing the
-    `XlaSharding` / `TPUPartitionedOutput` / `TPUPartitionedInput` ops inside
-    `cluster_func`.
- - For example, given the following `cluster_func` wrapping `func`: - - ```mlir - func @test(%arg0: tensor<*xi32>) { - "tf_device.cluster_func"(%arg0) { - func = @func, - step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32> - return - } - - func @func(%arg0: tensor<*xi32>) -> tensor<*xi32> { - %0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "\01\02\03", - sharding = "\01\02\03"} : (tensor<*xi32>) -> tensor<*xi32> - %1 = "tf.A"(%0) : (tensor<*xi32>) -> (tensor<*xi32>) - return %1 : tensor<*xi32> - } - ``` - - Now, cluster_func receives the following `*_sharding_configuration` - attributes, and `func` receives the mhlo.sharding attribute: - - ```mlir - func @test(%arg0: tensor<*xi32>) { - %0 = "tf_device.cluster_func"(%arg0) { - func = @func, - input_sharding_configuration = ["\01\02\03"], - output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], - step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32> - return - } - func @func(%arg0: tensor<*xi32> {mhlo.sharding = "\01\02\03"}) -> - (tensor<*xi32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) { - %0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "\01\02\03", sharding = "\01\02\03"} : (tensor<*xi32>) -> tensor<*xi32> - %1 = "tf.A"(%0) : (tensor<*xi32>) -> tensor<*xi32> - return %1 : tensor<*xi32> - } - ``` - }]; -} - def UnrollBatchMatMulPass : Pass<"tf-unroll-batch-matmul", "mlir::func::FuncOp"> { let summary = "Unroll TF BatchMatMul op into Reshape, Slice, MatMul, Pack ops."; let constructor = "TF::CreateUnrollBatchMatMulPassPass()"; @@ -381,7 +331,9 @@ def TensorFlowShapeInferencePass : Pass<"tf-shape-inference", "ModuleOp"> { let options = [ Option<"max_iterations_", "max-iterations", "int64_t", /*default=*/"10", - "Maximum shape inference iterations"> + "Maximum shape inference iterations">, + Option<"input_arg_shapes_", "input-arg-shapes", "std::string", /*default=*/"", + "Input tensor shapes. Shapes for different tensors are separated by ':', and dimension sizes for the same tensor are separated by ','">, ]; } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc index 0bc7b47377fa3f..40d9032b499ff6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc @@ -35,6 +35,7 @@ limitations under the License. 
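The new `input-arg-shapes` option defined above encodes one shape per argument: `,` separates the dimension sizes of a single tensor and `:` separates tensors, so `1,224,224,3:64` describes two arguments. A hedged sketch of driving the same behavior from C++ through the `CreateTFShapeInferencePass` overload this patch introduces; the concrete shapes are invented for illustration:

```cpp
#include <cstdint>
#include <memory>

#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

// Textual equivalent: -tf-shape-inference=input-arg-shapes=1,224,224,3:64
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>> MakeShapeInferencePass() {
  // Static storage: the pass keeps only an ArrayRef view of these shapes,
  // so the backing arrays must outlive it.
  static constexpr int64_t kArg0[] = {1, 224, 224, 3};  // shape of argument 0
  static constexpr int64_t kArg1[] = {64};              // shape of argument 1
  static const llvm::ArrayRef<int64_t> kShapes[] = {kArg0, kArg1};
  return mlir::TF::CreateTFShapeInferencePass(kShapes);
}
```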
#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -70,7 +71,7 @@ class AssetSinkingPass : public impl::AssetSinkingPassBase { SymbolTable symbol_table(module); for (auto initializer : init_op.getInitializers()) { auto func = symbol_table.lookup( - initializer.cast().getValue()); + mlir::cast(initializer).getValue()); RewriteFunction(symbol_table, func); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc index 141807309c4a9c..26bc9dae51057c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.cc @@ -54,7 +54,7 @@ constexpr StringRef kTfInputShapesAttr = "tf._input_shapes"; // Build and returns ElementsAttr which holds the data in 'tensor'. ElementsAttr GetTensorValueAsElementsAttr(const tensorflow::Tensor& tensor, OpBuilder builder) { - tensorflow::StatusOr tensor_attr_or = + absl::StatusOr tensor_attr_or = tensorflow::ConvertTensor(tensor, &builder); if (!tensor_attr_or.ok()) return nullptr; return tensor_attr_or.value(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc index 7f449520030876..68d50e54a1bce0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc @@ -18,12 +18,13 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -58,7 +59,7 @@ static mlir::LogicalResult FilterTfgSpecificArgResultAttributes( llvm::SmallVector &output_attrs) { for (auto it : llvm::zip( types, array_attr.template getAsRange())) { - if (std::get<0>(it).isa()) continue; + if (mlir::isa(std::get<0>(it))) continue; output_types.push_back(std::get<0>(it)); mlir::NamedAttrList list; @@ -80,7 +81,7 @@ static mlir::LogicalResult ReformatOpAttributes( mlir::tfg::TFGraphDialect::getDeviceAttrKey())) { tensorflow::DeviceNameUtils::ParsedName parsed_name; if (!tensorflow::DeviceNameUtils::ParseFullName( - attr.getValue().cast().getValue().str(), + mlir::cast(attr.getValue()).getValue().str(), &parsed_name)) return mlir::failure(); if (!parsed_name.has_type) { @@ -106,7 +107,7 @@ static mlir::LogicalResult ReformatOpAttributes( static void FilterOutBlockArgControlDep( ValueRange operands, llvm::SmallVectorImpl &filtered) { for (Value value : operands) - if (!value.isa()) filtered.push_back(value); + if (!mlir::isa(value)) filtered.push_back(value); } // Split the tfg.NextIteration into tf_executor::NextIterationSourceOp and @@ -114,7 +115,7 @@ static void FilterOutBlockArgControlDep( static void SplitNextIteration(Block &block) { // TODO(b/207144333): Supports callback for unregistered ops block.walk([&](Operation *op) { - if (!op->getName().getStringRef().equals("tfg.NextIteration")) return; + if (op->getName().getStringRef() != "tfg.NextIteration") return; mlir::OpBuilder builder(op); llvm::SmallVector new_operands; @@ -218,7 +219,7 @@ class ConvertGraphFuncOp : public OpConversionPattern { Block &block = graph_func.getBody().front(); for (auto iter = block.args_begin(), end_iter = block.args_end(); iter != end_iter; ++iter) { - if (!iter->getType().isa()) + if (!mlir::isa(iter->getType())) iter->replaceAllUsesWith(func.getBody().getArgument(idx++)); } @@ -412,9 +413,9 @@ class ConvertGeneralOp : public ConversionPattern { for (Value value : operands) { // Because of the property of graph region, the control operands may // not have been converted to tf_executor::ControlType. 
- if (value.getType().isa() || - value.getType().isa()) { - if (!value.isa()) + if (mlir::isa(value.getType()) || + mlir::isa(value.getType())) { + if (!mlir::isa(value)) island_control_operands.push_back(value); } else { inner_op_operands.push_back(value); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_annotate_dynamic_shape_inputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_annotate_dynamic_shape_inputs.cc index d1a244b7f2ec2a..b4a98605a34ac2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_annotate_dynamic_shape_inputs.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_annotate_dynamic_shape_inputs.cc @@ -53,7 +53,7 @@ class TPUAnnotateDynamicShapeInputsPass // Finds op that created a given value. If the value is a BlockArgument, this // returns the owner of the Block. Operation* GetOpOfValue(Value value) { - if (auto block_arg = value.dyn_cast()) + if (auto block_arg = mlir::dyn_cast(value)) return block_arg.getOwner()->getParentOp(); return value.getDefiningOp(); @@ -98,7 +98,7 @@ void TPUAnnotateDynamicShapeInputsPass::runOnOperation() { // Update the marked argument with dynamic shapes. for (int index : dynamic_shape_arg_index) { BlockArgument arg = func.getArgument(index); - auto inputType = arg.getType().dyn_cast(); + auto inputType = mlir::dyn_cast(arg.getType()); // Only rank 1 tensor is supported for now. if (!inputType || inputType.getRank() != 1) continue; auto shape = llvm::to_vector<4>(inputType.getShape()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_device_propagation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_device_propagation.cc index a6f9d7d4c63f01..e2b9c62ee8e6bc 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_device_propagation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_device_propagation.cc @@ -29,6 +29,7 @@ limitations under the License. 
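One non-cast cleanup rides along in the tfg-to-tfe hunks above: `SplitNextIteration` now compares `StringRef`s with `!=` instead of the deprecated `StringRef::equals`. A tiny self-contained sketch of the same change, assuming only LLVM's ADT headers:

```cpp
#include "llvm/ADT/StringRef.h"

bool IsNextIteration(llvm::StringRef op_name) {
  // Previously: op_name.equals("tfg.NextIteration")
  return op_name == "tfg.NextIteration";
}
```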
#include "mlir/IR/UseDefLists.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" @@ -94,8 +95,8 @@ void PopulateDeviceForOpResults( op_to_update = op_to_update->getParentOp(); for (Value result : op_to_update->getResults()) { - if (result.getType().isa()) continue; - if (result.getType().isa()) break; + if (mlir::isa(result.getType())) continue; + if (mlir::isa(result.getType())) break; value_to_device.insert({result, device}); } @@ -118,8 +119,8 @@ llvm::StringRef FindDeviceFromOperands( llvm::StringRef new_device; const bool is_switch = llvm::isa(op); for (Value operand : op.getOperands()) { - if (operand.getType().isa()) continue; - if (operand.getType().isa()) break; + if (mlir::isa(operand.getType())) continue; + if (mlir::isa(operand.getType())) break; if (is_switch && llvm::isa_and_nonnull(operand.getDefiningOp())) @@ -230,7 +231,7 @@ void PropagateDevicesToResults( mlir::Builder builder(func.getOperation()); for (OpOperand& operand : fetch.getOperation()->getOpOperands()) { - if (operand.get().getType().isa()) break; + if (mlir::isa(operand.get().getType())) break; auto it = value_to_device.find(operand.get()); if (it != value_to_device.end()) { auto device_attr = func.getResultAttrOfType( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc index 04b488a38048fd..2281658efc5ed1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc @@ -86,7 +86,7 @@ bool IsSupportedInputOp( resource_alias_analysis.GetResourceAliases(resource_iterator); auto is_generator = [](Value val) { - if (val.isa()) return true; + if (mlir::isa(val)) return true; Operation* definition = val.getDefiningOp(); return definition->getNumOperands() == 0 && definition->getNumResults() == 1; @@ -99,7 +99,7 @@ bool IsSupportedInputOp( if (!is_generator(alias)) return true; StringAttr device; - if (auto arg = alias.dyn_cast()) { + if (auto arg = mlir::dyn_cast(alias)) { device = func.getArgAttrOfType(arg.getArgNumber(), kFuncDeviceAttr); } else { @@ -186,10 +186,8 @@ bool HandleReplicatedInputs( BuildCopyWithLayout(execute_launch, compile_launch, get_layout, entry.value().get(), &builder); - auto device_list = replicate.getDevices() - .value() - .get(execute_launch.getDevice()) - .cast(); + auto device_list = mlir::cast( + replicate.getDevices().value().get(execute_launch.getDevice())); copy_with_layout->setAttr(kDeviceAttr, device_list.getValue()[entry.index()]); @@ -225,7 +223,7 @@ void HandleCompileAndExecutes( for (const auto& input_and_idx : llvm::enumerate(execute.getArgs())) { Value input = input_and_idx.value(); const int64_t execute_arg_index = input_and_idx.index(); - if (auto block_arg = input.dyn_cast()) { + if (auto block_arg = mlir::dyn_cast(input)) { // For a block argument, consider transforms only when it is a // replicated input (defining ops will be outside the replicate node). 
if (maybe_replicate != block_arg.getParentRegion()->getParentOp() || diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_host_computation_expansion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_host_computation_expansion.cc index fdea45957eb7d8..b2a3b81f63a1a9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_host_computation_expansion.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_host_computation_expansion.cc @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -41,7 +42,7 @@ bool HasOutsideCompilationAttribute(Operation* op) { // Finds op that created a given value. If the value is a BlockArgument, this // returns the owner of the Block. Operation* GetOpOfValue(Value value) { - if (auto block_arg = value.dyn_cast()) + if (auto block_arg = mlir::dyn_cast(value)) return block_arg.getOwner()->getParentOp(); return value.getDefiningOp(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_partitioned_op_conversion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_partitioned_op_conversion.cc index a2232f9f33bf2a..08165fb1435ff2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_partitioned_op_conversion.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_partitioned_op_conversion.cc @@ -22,6 +22,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -61,12 +62,12 @@ LogicalResult ReplacePartitionedOp(IntegerAttr num_cores_per_replica, T op) { } auto element_type = getElementTypeOrSelf(first_operand_type); - if (element_type.isa()) { + if (mlir::isa(element_type)) { first_operand_type = - element_type.cast().getSubtypes().front(); + mlir::cast(element_type).getSubtypes().front(); } - auto tensor_type = first_operand_type.dyn_cast_or_null(); + auto tensor_type = mlir::dyn_cast_or_null(first_operand_type); if (!(tensor_type && tensor_type.hasRank())) { return op->emitError() << "cannot convert op with unranked or non-tensor input type " diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc index fa18fc25ce9c67..5f708ce0ee1a74 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc @@ -75,10 +75,14 @@ ResourceValueAndSubtype GetResourceWriteResult( // Checks if resource is read by TPU cluster. 
bool ClusterFuncHasResourceRead(tf_device::ClusterFuncOp cluster_func, Value resource) { - for (Operation* resource_user : resource.getUsers()) - if (auto read = dyn_cast(resource_user)) - for (Operation* read_user : read.getValue().getUsers()) + for (Operation* resource_user : resource.getUsers()) { + if (auto read = dyn_cast(resource_user)) { + for (Operation* read_user : read.getValue().getUsers()) { if (read_user == cluster_func) return true; + if (isa(read_user)) return true; + } + } + } return false; } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc index c6ce1428bfb3e4..ef16273e9eea45 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc @@ -90,7 +90,7 @@ LogicalResult HandleCast(TF::CastOp cast_op, ArrayRef new_shape) { auto transform_result_type = RankedTensorType::get(new_shape, getElementTypeOrSelf(cast_input)); cast_input.setType(transform_result_type); - auto block_arg = cast_input.dyn_cast(); + auto block_arg = mlir::dyn_cast(cast_input); auto cast_op_input = dyn_cast_or_null(cast_input.getDefiningOp()); while (block_arg || cast_op_input) { if (block_arg) { @@ -105,7 +105,7 @@ LogicalResult HandleCast(TF::CastOp cast_op, ArrayRef new_shape) { RankedTensorType::get(new_shape, getElementTypeOrSelf(cast_input)); cast_input.setType(transform_result_type); // Update block arg and cast_op_input. - block_arg = cast_input.dyn_cast(); + block_arg = mlir::dyn_cast(cast_input); cast_op_input = dyn_cast_or_null(cast_input.getDefiningOp()); } } @@ -114,7 +114,7 @@ LogicalResult HandleCast(TF::CastOp cast_op, ArrayRef new_shape) { // Handles padding before convolution for space to depth transform. LogicalResult HandlePad(TF::PadOp op, int32_t kernel_size, int32_t block_size) { - auto ranked_type = op.getInput().getType().dyn_cast(); + auto ranked_type = mlir::dyn_cast(op.getInput().getType()); if (!ranked_type) return failure(); auto pad_input_shape = ranked_type.getShape(); Location loc = op.getLoc(); @@ -164,7 +164,7 @@ void HandleConv2DStride(TF::Conv2DOp conv2d) { // Transforms input shape for the first convolution. void HandleConv2DInput(TF::Conv2DOp conv2d, int64_t block_size) { auto input = conv2d.getInput(); - auto input_shape = input.getType().cast().getShape(); + auto input_shape = mlir::cast(input.getType()).getShape(); SmallVector transform_shape = { input_shape[0], input_shape[1] / block_size, input_shape[2] / block_size, input_shape[3] * block_size * block_size}; @@ -228,7 +228,7 @@ void HandleConv2DFilter(TF::Conv2DOp conv2d, int64_t block_size) { OpBuilder builder(conv2d); builder.setInsertionPoint(conv2d); // Book keeping filter information. 
-  auto filter_shape = filter.getType().cast<RankedTensorType>().getShape();
+  auto filter_shape = mlir::cast<RankedTensorType>(filter.getType()).getShape();
   int64_t height = filter_shape[0];
   int64_t width = filter_shape[1];
   int64_t channel = filter_shape[2];
@@ -422,7 +422,7 @@ bool HandleHostReplicatedInputs(int64_t index,
   }
   for (auto entry : llvm::enumerate(inputs)) {
     Value input = entry.value().get();
-    auto ranked_type = input.getType().dyn_cast<RankedTensorType>();
+    auto ranked_type = mlir::dyn_cast<RankedTensorType>(input.getType());
     if (!ranked_type) return false;
     auto input_shape = ranked_type.getShape();
     auto space_to_depth =
@@ -442,7 +442,7 @@ void HandleCluster(tf_device::ClusterFuncOp cluster_func, int32_t block_size,
   llvm::SmallVector transform_input_indices;
   for (const auto& input : llvm::enumerate(cluster_func.getOperands())) {
-    if (auto block_arg = input.value().dyn_cast<BlockArgument>()) {
+    if (auto block_arg = mlir::dyn_cast<BlockArgument>(input.value())) {
       if (block_arg.getArgNumber() != arg_num) continue;
       // For a block argument, consider transforms only when it is a replicated
       // input (defining ops will be outside the replicate node).
@@ -462,7 +462,8 @@ void HandleCluster(tf_device::ClusterFuncOp cluster_func, int32_t block_size,
       continue;
     }
     if (!IsSupportedHostInputOp(input_op)) continue;
-    auto ranked_type = input.value().getType().dyn_cast<RankedTensorType>();
+    auto ranked_type =
+        mlir::dyn_cast<RankedTensorType>(input.value().getType());
     if (!ranked_type) continue;
     auto input_shape = ranked_type.getShape();
     HandleHostInput(input.value(), input.index(), cluster_func, block_size,
@@ -473,7 +474,7 @@ void HandleCluster(tf_device::ClusterFuncOp cluster_func, int32_t block_size,

 // Checks if input shape of convolution is good for space to depth transform.
 bool Conv2DInputShapeCanTransform(Value input) {
-  auto ranked_type = input.getType().dyn_cast<RankedTensorType>();
+  auto ranked_type = mlir::dyn_cast<RankedTensorType>(input.getType());
   if (!ranked_type) return false;
   auto input_shape = ranked_type.getShape();
   int32_t batch_size = input_shape[0];
@@ -486,7 +487,7 @@ bool Conv2DInputShapeCanTransform(Value input) {

 // Get block argument id and number of users for the input arg.
 std::optional<BlockArgumentInfo> GetBlockArgNum(Value arg) {
-  if (auto block_arg = arg.dyn_cast<BlockArgument>()) {
+  if (auto block_arg = mlir::dyn_cast<BlockArgument>(arg)) {
     if (!Conv2DInputShapeCanTransform(arg)) return std::nullopt;
     unsigned num_users =
         std::distance(block_arg.getUsers().begin(), block_arg.getUsers().end());
@@ -540,9 +541,9 @@ std::optional<BlockArgumentInfo> GetConv2DInputArgNum(TF::Conv2DOp conv2d) {
 void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) {
   // Check if input and filter type are RankedTensorType.
   auto input_tensor_type =
-      conv2d.getInput().getType().dyn_cast<RankedTensorType>();
+      mlir::dyn_cast<RankedTensorType>(conv2d.getInput().getType());
   auto filter_tensor_type =
-      conv2d.getFilter().getType().dyn_cast<RankedTensorType>();
+      mlir::dyn_cast<RankedTensorType>(conv2d.getFilter().getType());
   if (!input_tensor_type || !filter_tensor_type) return;
   // Bookkeeping filter shape for padding and backprop filter rewrite.
   auto filter_shape = filter_tensor_type.getShape();
@@ -550,7 +551,7 @@ void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) {
                                          filter_shape.end());
   // Handles input.
   auto conv2d_input = conv2d.getInput();
-  if (auto block_arg = conv2d_input.dyn_cast<BlockArgument>()) {
+  if (auto block_arg = mlir::dyn_cast<BlockArgument>(conv2d_input)) {
     // Change on device function type/shape.
     HandleFuncOp(block_arg.getOwner()->getParentOp());
   }
@@ -559,7 +560,7 @@ void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) {
   // Rewrite pad_op before Convolution.
   if (failed(HandlePad(pad_op, filter_shape[0], block_size))) return;
   auto pad_input = pad_op.getInput();
-  if (auto block_arg = pad_input.dyn_cast<BlockArgument>()) {
+  if (auto block_arg = mlir::dyn_cast<BlockArgument>(pad_input)) {
     // Change on device function type/shape.
     HandleFuncOp(block_arg.getOwner()->getParentOp());
   }
@@ -573,7 +574,7 @@ void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) {
   // Bookkeeping new filter shape for backprop filter rewrite.
   // Filter shape is defined in HandleConv2DFilter, thus it is RankedTensorType.
   filter_shape =
-      conv2d.getFilter().getType().cast<RankedTensorType>().getShape();
+      mlir::cast<RankedTensorType>(conv2d.getFilter().getType()).getShape();
   SmallVector<int64_t, 4> new_filter_shape(filter_shape.begin(),
                                            filter_shape.end());

@@ -593,7 +594,7 @@ void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) {
 int32_t GetConv2DBlockSize(TF::Conv2DOp conv2d) {
   SmallVector<int32_t, 4> strides(4, 1);
   for (int i = 0; i < 3; ++i) {
-    strides[i] = conv2d.getStrides()[i].cast<IntegerAttr>().getInt();
+    strides[i] = mlir::cast<IntegerAttr>(conv2d.getStrides()[i]).getInt();
   }

   // Space to depth only supports striding at spatial dimension.
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc
index 4dc9daa6c705ee..21f62e41383401 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_validate_inputs.cc
@@ -372,7 +372,8 @@ bool CheckOpsClusterIO(Operation* op, MetadataMap& metadata_map) {

 bool TypeMustBeNonXLA(const Type& type) {
   const Type elem = getElementTypeOrSelf(type);
-  return !elem.isa<TF::ResourceType>() && !tensorflow::TypeValidForXLA(type);
+  return !mlir::isa<TF::ResourceType>(elem) &&
+         !tensorflow::TypeValidForXLA(type);
 }

 // Check if the op cannot be XLA compiled.
If the op does not satisfy this @@ -539,6 +540,18 @@ bool IsValidMAXIMALSharding(Operation* op, MetadataMap& metadata_map) { return true; } +bool HasSingleCoreTpu(Operation* op) { + if (auto compilation_attr = + op->getAttrOfType(TF::kCompileDeviceTypeAttr)) { + if (compilation_attr.getValue().str() == TF::kTpuDevice) { + op->emitOpError( + "TF2XLA TPU bridge input check: found a single-core TPU graph"); + return true; + } + } + return false; +} + void TPUValidateInputsPass::runOnOperation() { ModuleOp module = getOperation(); bool success = true; @@ -563,10 +576,11 @@ void TPUValidateInputsPass::runOnOperation() { success &= IsValidMAXIMALSharding(op, metadata_map); success &= IsValidShardingTupleForArity(op); } + success &= !HasSingleCoreTpu(op); + if (!success) { + signalPassFailure(); + } }); - if (!success) { - signalPassFailure(); - } } } // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc index abdd1a83d516eb..ff8ac1ad7cacd1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc @@ -96,7 +96,7 @@ TF::ReshapeOp ConvertTFBatchMatMulOp::createReshapeOp( template std::vector ConvertTFBatchMatMulOp::sliceInput( Value value, int batch_size, Location loc, PatternRewriter& rewriter) { - RankedTensorType tensorType = value.getType().cast(); + RankedTensorType tensorType = mlir::cast(value.getType()); Type element_type = tensorType.getElementType(); int rank = tensorType.getShape().size(); @@ -150,17 +150,17 @@ LogicalResult ConvertTFBatchMatMulOp::matchAndRewrite( Value input_lhs = op.getX(); Value input_rhs = op.getY(); - if (!input_lhs.getType().isa()) { + if (!mlir::isa(input_lhs.getType())) { // LHS must be a ranked tensor type return failure(); } - if (!input_rhs.getType().isa()) { + if (!mlir::isa(input_rhs.getType())) { // RHS must be a ranked tensor type return failure(); } - auto lhs_type = input_lhs.getType().cast(); - auto rhs_type = input_rhs.getType().cast(); + auto lhs_type = mlir::cast(input_lhs.getType()); + auto rhs_type = mlir::cast(input_rhs.getType()); // Skip int8 x int8 => int32. if (lhs_type.getElementType().isInteger(8) && diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc index 20dcdb8b034c97..9237ff8d5b69dd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc @@ -34,6 +34,7 @@ limitations under the License. 
#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "stablehlo/dialect/ChloOps.h" // from @stablehlo // IWYU pragma: keep #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep @@ -156,8 +157,8 @@ LogicalResult SymbolizeCustomCallCalledIndex( return WalkResult::interrupt(); } - auto called_index_attr = backend_config.get(kCalledIndexAttrName) - .dyn_cast_or_null(); + auto called_index_attr = mlir::dyn_cast_or_null( + backend_config.get(kCalledIndexAttrName)); if (!called_index_attr) { op->emitOpError() << "is missing attribute '" << kCalledIndexAttrName << "'"; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_serialization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_serialization.cc index a75bf4c75d8033..6ab5da6bdb2e3c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_serialization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_serialization.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "stablehlo/api/PortableApi.h" // from @stablehlo #include "stablehlo/dialect/Serialization.h" // from @stablehlo #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep @@ -66,8 +67,8 @@ FailureOr DesymbolizeCustomCallCalledIndex(ModuleOp module) { << "'"; return WalkResult::interrupt(); } - auto called_func = backend_config.get(kCalledFuncAttrName) - .dyn_cast_or_null(); + auto called_func = mlir::dyn_cast_or_null( + backend_config.get(kCalledFuncAttrName)); if (!called_func) { op->emitOpError() << "is missing attribute '" << kCalledFuncAttrName << "'"; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc index 1992f43a951184..8ce264b47b57d4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc @@ -18,6 +18,7 @@ limitations under the License. #include +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -44,8 +45,7 @@ void MoveResourceArgsToEnd(func::FuncOp callee) { // Copy the resource-type parameters to the end. 
@@ -44,8 +45,7 @@ void MoveResourceArgsToEnd(func::FuncOp callee) {
   // Copy the resource-type parameters to the end.
   for (unsigned i = 0; i < num_params; ++i) {
     BlockArgument param = callee.getArgument(i);
-    if (getElementTypeOrSelf(param.getType())
-            .template isa<TF::ResourceType>()) {
+    if (mlir::isa<TF::ResourceType>(getElementTypeOrSelf(param.getType()))) {
       removed_params.set(i);
       callee.getBody().addArgument(param.getType(), param.getLoc());
       param.replaceAllUsesWith(callee.getArguments().back());
@@ -65,7 +65,7 @@ void RewriteCall(tf_device::ClusterFuncOp cluster_func_op, SymbolTable &symtab,
   llvm::SmallVector<Value> non_resource_args, resource_args;
   bool has_resources = false, in_order = true;
   for (const Value &arg : cluster_func_op.getOperands()) {
-    if (!getElementTypeOrSelf(arg.getType()).template isa<TF::ResourceType>()) {
+    if (!mlir::isa<TF::ResourceType>(getElementTypeOrSelf(arg.getType()))) {
       non_resource_args.push_back(arg);
       if (has_resources) in_order = false;
     } else {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_validate_inputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_validate_inputs.cc
index 24ae9056866ad4..9267607e7e342a 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_validate_inputs.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_validate_inputs.cc
@@ -65,7 +65,7 @@ LogicalResult HasNoNestedEntryFunctions(
       // "tf_saved_model.initializer_type" attribute from the callee of the
       // inner calls if the problem ever arises.
       entry_func->emitError()
-          << "TF2XLA MLIR CPU/GPU phase 1 bridge expects no nested calls"
+          << "TF2XLA MLIR Non-replicated Phase 1 Bridge expects no nested calls"
             " of entry functions as they prevent graph traversal in some "
             "passes from "
            "working correctly";
@@ -75,15 +75,13 @@
   return success();
 }

-// MLIR CPU/GPU phase 1 pipeline assumes an entry function has single region and
-// single block when handling top-level compilation markers.
-LogicalResult HasSingleBlockEntryFunctions( - llvm::SmallVector &entry_funcs, SymbolTable &symtab) { +LogicalResult HasTopLevelCompilationMarker( + llvm::SmallVector &entry_funcs) { for (auto &entry_func : entry_funcs) { - if (!HasSingleBlock(entry_func)) { - entry_func->emitError() << "TF2XLA MLIR CPU/GPU MLIR phase 1 bridge " - "expects single region and single " - "block in an entry function."; + if (entry_func->hasAttr(mlir::TF::kCompileDeviceTypeAttr)) { + entry_func->emitError() << "TF2XLA MLIR Non-replicated Phase 1 Bridge " + "does not support top-level compilation " + "marker."; return failure(); } } @@ -102,7 +100,7 @@ void XlaValidateInputsPass::runOnOperation() { return signalPassFailure(); } - if (HasSingleBlockEntryFunctions(entry_funcs, symtab).failed()) { + if (HasTopLevelCompilationMarker(entry_funcs).failed()) { return signalPassFailure(); } } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/BUILD b/tensorflow/compiler/mlir/tensorflow/translate/BUILD index 59d7cfd7081106..f0280340dddf62 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/translate/BUILD @@ -107,53 +107,11 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:IR", - "@local_xla//xla:status_macros", - ], -) - -cc_library( - name = "translate_tf_dialect_op", - srcs = ["translate_tf_dialect_op.cc"], - deps = [ - ":export_tf_dialect_op", - "//tensorflow/compiler/mlir/tensorflow", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", - "@llvm-project//mlir:TranslateLib", - "@local_tsl//tsl/platform:protobuf", - ], - alwayslink = 1, -) - -cc_library( - name = "mlir_roundtrip_pass", - srcs = ["mlir_roundtrip_pass.cc"], - hdrs = ["mlir_roundtrip_pass.h"], - deps = [ - ":export_graphdef", - ":import_model", - ":mlir_roundtrip_flags", - "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/core:core_cpu_lib", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", "@local_xla//xla:status_macros", ], ) -cc_library( - name = "mlir_roundtrip_pass_registration", - srcs = ["mlir_roundtrip_pass_registration.cc"], - deps = [ - ":mlir_roundtrip_pass", - ], - alwayslink = 1, -) - cc_library( name = "mlir_roundtrip_flags", srcs = ["mlir_roundtrip_flags.cc"], @@ -209,6 +167,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 523048cd7cd582..0d8a75e7f7de9d 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -42,6 +42,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -129,7 +130,7 @@ class Exporter { // function are added to the graph with special op names kArgOp and kRetOp. 
// Later on, this graph can be converted a function definition and added to // another graph. - static StatusOr> Convert( + static absl::StatusOr> Convert( const GraphExportConfig& configs, const Dialect* tf_dialect, const SymbolTable& symbol_table, FuncOp function, FunctionLibraryDefinition* flib_def, @@ -155,13 +156,12 @@ class Exporter { Status AddEdge(Operation* inst); - StatusOr> GetArgumentNode(BlockArgument arg, - unsigned index, - llvm::StringRef name); - StatusOr> GetReturnNode(FuncOp function, - Value operand, - unsigned index, - llvm::StringRef name); + absl::StatusOr> GetArgumentNode( + BlockArgument arg, unsigned index, llvm::StringRef name); + absl::StatusOr> GetReturnNode(FuncOp function, + Value operand, + unsigned index, + llvm::StringRef name); Status GetControlRetNodes(mlir::tf_executor::FetchOp fetch, absl::flat_hash_set* control_ret_nodes); // Adds one edge between src_node and dst_node. If it is not a control edge, @@ -192,7 +192,7 @@ std::string FindFunctionName(const GraphExportConfig& configs, FuncOp func) { return func.getName().str(); } -StatusOr> Exporter::GetArgumentNode( +absl::StatusOr> Exporter::GetArgumentNode( BlockArgument arg, unsigned index, llvm::StringRef name) { auto func = arg.getParentRegion()->getParentOfType(); @@ -205,9 +205,9 @@ StatusOr> Exporter::GetArgumentNode( node_def->set_op(FunctionLibraryDefinition::kArgOp); - mlir::TensorType arg_type = arg.getType().cast(); + mlir::TensorType arg_type = mlir::cast(arg.getType()); if (auto resource_type = - arg_type.getElementType().dyn_cast()) { + mlir::dyn_cast(arg_type.getElementType())) { llvm::ArrayRef subtypes = resource_type.getSubtypes(); if (!subtypes.empty()) { AttrValue handle_dtypes_attr; @@ -254,7 +254,7 @@ StatusOr> Exporter::GetArgumentNode( return node_def; } -StatusOr> Exporter::GetReturnNode( +absl::StatusOr> Exporter::GetReturnNode( FuncOp function, Value operand, unsigned index, llvm::StringRef name) { auto node_def = std::make_unique(); if (!name.empty()) @@ -266,7 +266,8 @@ StatusOr> Exporter::GetReturnNode( node_def->set_op(FunctionLibraryDefinition::kRetOp); DataType dtype; TF_RETURN_IF_ERROR(ConvertToDataType( - operand.getType().cast().getElementType(), &dtype)); + mlir::cast(operand.getType()).getElementType(), + &dtype)); AttrValue type_attr; type_attr.set_type(dtype); (*node_def->mutable_attr())["T"] = type_attr; @@ -290,7 +291,7 @@ StatusOr> Exporter::GetReturnNode( Status Exporter::AddEdgeBetweenNodes(Value src, Node* dst_node, unsigned dst_index) { - if (auto input_result = src.dyn_cast()) { + if (auto input_result = mlir::dyn_cast(src)) { auto* input_inst = GetIslandInnerOpOrSelf(input_result.getOwner()); // Replaces the input node with NextIteration sink if it is a NextIteration // source. 
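From here on, the export_graphdef hunks also fold in the status-type cleanup: `tensorflow::Status`, `OkStatus()`, and `StatusOr<T>` are spelled as the underlying `absl::Status`, `absl::OkStatus()`, and `absl::StatusOr<T>`. A minimal sketch of the pattern, assuming only Abseil (the function names are invented for illustration):

```cpp
#include "absl/status/status.h"
#include "absl/status/statusor.h"

absl::StatusOr<int> ParsePositive(int raw) {
  if (raw <= 0) return absl::InvalidArgumentError("value must be positive");
  return raw;  // implicitly wraps the value in an ok StatusOr
}

absl::Status Validate(int raw) {
  // Previously written as tensorflow::Status / OkStatus() via aliases.
  return ParsePositive(raw).status();
}
```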
@@ -302,23 +303,23 @@ Status Exporter::AddEdgeBetweenNodes(Value src, Node* dst_node,
     auto node_it = nodes_.find(input_inst);
     TF_RET_CHECK(node_it != nodes_.end())
         << "Use of OpResult encountered before def!";
-    if (input_result.getType().isa<mlir::tf_executor::ControlType>()) {
+    if (mlir::isa<mlir::tf_executor::ControlType>(input_result.getType())) {
       graph_->AddControlEdge(node_it->second, dst_node,
                              /*allow_duplicates=*/true);
     } else {
       graph_->AddEdge(node_it->second, input_result.getResultNumber(), dst_node,
                       dst_index);
     }
-    return OkStatus();
+    return absl::OkStatus();
   }

-  auto input_arg = src.cast<mlir::BlockArgument>();
+  auto input_arg = mlir::cast<mlir::BlockArgument>(src);
   auto input_node_it = args_.find(input_arg);
   TF_RET_CHECK(input_node_it != args_.end())
       << "Use of BlockArgument encountered before def!";
   // For argument, there is only one result output, so the index is always 0.
   graph_->AddEdge(input_node_it->second, 0, dst_node, dst_index);
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status Exporter::AddEdge(Operation* inst) {
@@ -327,13 +328,13 @@ Status Exporter::AddEdge(Operation* inst) {
   if (auto fetch = llvm::dyn_cast<mlir::tf_executor::FetchOp>(inst)) {
     for (auto operand_and_idx : llvm::enumerate(fetch.getOperands())) {
       Value operand = operand_and_idx.value();
-      if (operand.getType().isa<mlir::tf_executor::ControlType>()) break;
+      if (mlir::isa<mlir::tf_executor::ControlType>(operand.getType())) break;

       auto* dst_node = returns_[fetch][operand_and_idx.index()];
       TF_RETURN_IF_ERROR(AddEdgeBetweenNodes(operand, dst_node, 0));
     }

-    return OkStatus();
+    return absl::OkStatus();
   }

   // For tf_executor.NextIteration.Sink, skip its token operand and add data and
@@ -348,14 +349,14 @@ Status Exporter::AddEdge(Operation* inst) {
       TF_RETURN_IF_ERROR(AddEdgeBetweenNodes(control_and_idx.value(), dst_node,
                                              control_and_idx.index() + 1));

-    return OkStatus();
+    return absl::OkStatus();
   }

   // For tf_executor.NextIteration.Source, op can be skipped as it is assumed
   // there are no operands.
   if (llvm::isa<mlir::tf_executor::NextIterationSourceOp>(inst)) {
     assert(inst->getNumOperands() == 0);
-    return OkStatus();
+    return absl::OkStatus();
   }

   Operation* op = GetIslandInnerOpOrSelf(inst);
@@ -377,7 +378,7 @@ Status Exporter::AddEdge(Operation* inst) {
         AddEdgeBetweenNodes(operand_and_idx.value(), dst_node,
                             operand_and_idx.index() + operand_offset));

-  return OkStatus();
+  return absl::OkStatus();
 }

 void Exporter::UseOriginalFunctionNames(NodeDef& node_def) {
@@ -424,7 +425,7 @@ Status Exporter::AddInstructionNode(Operation* inst) {
   TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(std::move(*node_def)));
   DCHECK(node != nullptr);
   nodes_[inst] = node;
-  return OkStatus();
+  return absl::OkStatus();
 }

 bool IsEntryFunctionArg(BlockArgument arg) {
@@ -438,7 +439,7 @@ Status Exporter::AddArgumentNode(BlockArgument arg, unsigned index,
   TF_ASSIGN_OR_RETURN(auto node_def, GetArgumentNode(arg, index, name));
   TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(std::move(*node_def)));
   args_[arg] = node;
-  return OkStatus();
+  return absl::OkStatus();
 }

 // Creates return nodes per operand of a FetchOp.
If names is supplied, those @@ -447,7 +448,8 @@ Status Exporter::AddFetchNode(FuncOp function, mlir::tf_executor::FetchOp fetch, llvm::ArrayRef names) { auto& return_nodes = returns_[fetch]; for (auto operand_and_idx : llvm::enumerate(fetch.getOperands())) { - if (operand_and_idx.value().getType().isa()) + if (mlir::isa( + operand_and_idx.value().getType())) break; TF_ASSIGN_OR_RETURN( @@ -458,7 +460,7 @@ Status Exporter::AddFetchNode(FuncOp function, mlir::tf_executor::FetchOp fetch, TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(std::move(*node_def))); return_nodes.push_back(node); } - return OkStatus(); + return absl::OkStatus(); } // Collects control ret Nodes based on tf_executor.graph's associated @@ -467,7 +469,7 @@ Status Exporter::GetControlRetNodes( mlir::tf_executor::FetchOp fetch, absl::flat_hash_set* control_ret_nodes) { for (Value fetch_operand : fetch.getOperands()) { - if (fetch_operand.getType().isa()) { + if (mlir::isa(fetch_operand.getType())) { Operation* defining_op = GetIslandInnerOpOrSelf(fetch_operand.getDefiningOp()); auto node_it = nodes_.find(defining_op); @@ -475,7 +477,7 @@ Status Exporter::GetControlRetNodes( control_ret_nodes->insert(node_it->second); } } - return OkStatus(); + return absl::OkStatus(); } // After conversion from MLIR the input names are all blank which causes @@ -494,7 +496,7 @@ void FixupInputNamesFromEdges(Graph* graph) { } } } -StatusOr> Exporter::Convert( +absl::StatusOr> Exporter::Convert( const GraphExportConfig& configs, const Dialect* tf_dialect, const SymbolTable& symbol_table, FuncOp function, FunctionLibraryDefinition* flib_def, @@ -509,14 +511,16 @@ StatusOr> Exporter::Convert( auto dict_attr = function->getAttrOfType(kEntryFuncAttr); if (dict_attr) { - TF_RET_CHECK(dict_attr.get("inputs").isa()) + TF_RET_CHECK(mlir::isa(dict_attr.get("inputs"))) << "inputs missing in entry function attribute"; - TF_RET_CHECK(dict_attr.get("outputs").isa()) + TF_RET_CHECK(mlir::isa(dict_attr.get("outputs"))) << "outputs missing in entry function attribute"; - dict_attr.get("inputs").cast().getValue().split( - input_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); - dict_attr.get("outputs").cast().getValue().split( - output_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); + mlir::cast(dict_attr.get("inputs")) + .getValue() + .split(input_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); + mlir::cast(dict_attr.get("outputs")) + .getValue() + .split(output_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); } auto graph = std::make_unique(OpRegistry::Global()); @@ -582,7 +586,7 @@ StatusOr> Exporter::Convert( int index = it.index(); auto arg = it.value(); mlir::Type type = arg.getType(); - if (!type.isa()) { + if (!mlir::isa(type)) { return errors::InvalidArgument( "FuncOps arguments must have tensor types. Found ", mlir::debugString(type), " in function ", function.getName().str()); @@ -601,14 +605,14 @@ StatusOr> Exporter::Convert( // library rather than the all the functions exported so far. TF_RETURN_IF_ERROR(graph->mutable_flib_def()->AddLibrary(*flib_def)); } - return OkStatus(); + return absl::OkStatus(); }; // Adds nodes for operations. for (Operation& inst : graph_op.GetBody()) { for (auto type : inst.getResultTypes()) - if (!type.isa()) + if (!mlir::isa(type)) return errors::InvalidArgument( "Values must be of tensor type, TensorFlow control type, or " "TensorFlow token type. 
Found ", @@ -669,7 +673,7 @@ Status Exporter::ConvertLibFunction( llvm::SmallDenseSet& visited_functions) { // Return early if the function has already been exported. bool is_new_function = visited_functions.insert(function).second; - if (!is_new_function) return OkStatus(); + if (!is_new_function) return absl::OkStatus(); auto function_name = FindFunctionName(configs, function); @@ -780,7 +784,7 @@ Status Exporter::Convert(mlir::ModuleOp module, if (flib_def != nullptr) { TF_RETURN_IF_ERROR(flib_def->AddLibrary(temp_flib_def)); } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -805,7 +809,7 @@ Status ConvertMlirToGraph(mlir::ModuleOp module, &control_ret_nodes); } -StatusOr> ConvertMlirToGraphdef( +absl::StatusOr> ConvertMlirToGraphdef( mlir::ModuleOp module, const GraphExportConfig& configs) { FunctionLibraryDefinition flib_def(OpRegistry::Global(), FunctionDefLibrary()); @@ -825,7 +829,7 @@ StatusOr> ConvertMlirToGraphdef( return graphdef; } -tsl::Status ConvertMlirFunctionToFunctionLibraryDef( +absl::Status ConvertMlirFunctionToFunctionLibraryDef( FuncOp func, const GraphExportConfig& configs, FunctionDef* function_def) { Dialect* tf_dialect = func.getContext()->getLoadedDialect("tf"); FunctionLibraryDefinition flib_def(OpRegistry::Global(), @@ -844,7 +848,7 @@ tsl::Status ConvertMlirFunctionToFunctionLibraryDef( const FunctionDef* func_def = flib_def.Find(name); if (func_def != nullptr) { *function_def = *func_def; - return OkStatus(); + return absl::OkStatus(); } return absl::InvalidArgumentError( absl::StrCat("Function '", name, diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h index 562226e1c764bc..e5e62a3e05a330 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h @@ -30,7 +30,7 @@ limitations under the License. namespace tensorflow { // Given an MLIR module, returns a GraphDef. -tsl::StatusOr> ConvertMlirToGraphdef( +absl::StatusOr> ConvertMlirToGraphdef( mlir::ModuleOp module, const GraphExportConfig& configs); // Converts an MLIR module to TensorFlow graph and FunctionLibraryDefinition. diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc index 6ce83519a0fe6d..debe84b63bbd1c 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc @@ -25,6 +25,7 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" @@ -59,7 +60,7 @@ Status SetTypeAttribute(absl::string_view name, ContainerT types, assert(result.second && "cannot have multiple attributes with the same name"); (void)result; - return OkStatus(); + return absl::OkStatus(); } // Sets shape list attribute with the given `name` to the given `shapes`. If the @@ -97,7 +98,7 @@ Status GetUnregisteredAttrs( absl::flat_hash_set* attrs_to_ignore) { if (!op_reg_data) { // This is likely a function call node, so we should continue. 
- return OkStatus(); + return absl::OkStatus(); } // Collect all the registered attributes. @@ -114,7 +115,7 @@ Status GetUnregisteredAttrs( absl::string_view(attr.getName().data(), attr.getName().size())); } } - return OkStatus(); + return absl::OkStatus(); } // Collects all attribute names to ignore in an MLIR operation when exporting to @@ -183,7 +184,7 @@ Status PopulateDerivedAttributes(mlir::Operation* inst, llvm::StringRef name, auto values = inst->getResults(); auto begin = values.begin(); auto end = values.begin(); - while (end != values.end() && (*end).getType().isa()) + while (end != values.end() && mlir::isa((*end).getType())) end++; if (begin != end) { mlir::TF::ResultShapeRange output_shapes = { @@ -193,7 +194,7 @@ Status PopulateDerivedAttributes(mlir::Operation* inst, llvm::StringRef name, } } - return OkStatus(); + return absl::OkStatus(); } // A `Cast` with DstT == SrcT can be introduced in MLIR as a shape cast. But @@ -253,7 +254,7 @@ Status GetAttrValuesFromOperation( value.mutable_func()->set_name(""); (*attributes)[kShapeInferenceGraph] = value; } - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr> ConvertTFDialectOpToNodeDef( diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h index ae87dad305a7e0..f15e741b247340 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h @@ -42,7 +42,7 @@ Status GetAttrValuesFromOperation( // ShapedType for the leading values with ShapedType in the results of the // nodes. Set it to true if the returned NodeDef will be executed by the linked // TF Eager runtime. -StatusOr> ConvertTFDialectOpToNodeDef( +absl::StatusOr> ConvertTFDialectOpToNodeDef( mlir::Operation* inst, llvm::StringRef name, bool ignore_unregistered_attrs); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 42b059cbd0a527..3e72550a88749a 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -68,6 +68,7 @@ limitations under the License. #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" @@ -528,7 +529,7 @@ Status UpdateLegacyFedInputNode(const GraphDef& graph_def, auto it = inputs.find(node_name); // Node is not an input. 
- if (it == inputs.end()) return OkStatus(); + if (it == inputs.end()) return absl::OkStatus(); if (HasNonPrimaryOutputInUse(graph_def, node_name)) { return errors::InvalidArgument( @@ -549,7 +550,7 @@ Status UpdateLegacyFedInputNode(const GraphDef& graph_def, node->clear_input(); AddNodeAttr("dtype", dtype, node); AddNodeAttr("shape", it->second.shape, node); - return OkStatus(); + return absl::OkStatus(); } // Preprocesses GraphDef before it can be converted to Graph by, @@ -575,7 +576,7 @@ Status PreprocessGraphDef(const GraphImportConfig* specs, GraphDef* graph_def) { } ::tensorflow::AddDefaultsToNodeDef(op_reg_data->op_def, &node_def); } - return OkStatus(); + return absl::OkStatus(); } // Mapping from node name to feed (index and ArrayInfo). Node name must outlive @@ -691,7 +692,7 @@ Status ImporterBase::ConvertDeferredFunctions() { } } - return OkStatus(); + return absl::OkStatus(); } Status ImporterBase::RemoveBackedges() { @@ -718,7 +719,7 @@ Status ImporterBase::RemoveBackedges() { GetReversePostOrder( *graph_, &ordered_nodes_, [](const Node* n1, const Node* n2) { return n1->name() < n2->name(); }); - return OkStatus(); + return absl::OkStatus(); } Status CopyStackTraces(const Graph& from, Graph* to) { @@ -744,7 +745,7 @@ Status CopyStackTraces(const Graph& from, Graph* to) { } } - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr> @@ -808,7 +809,7 @@ Status ImporterBase::GetInputOutputNodes( absl::StrCat("Graph does not contain node: ", name)); } nodes->insert(it->second); - return OkStatus(); + return absl::OkStatus(); }; // Remap feeds and fetches to newly created Placeholder nodes. @@ -835,7 +836,7 @@ Status ImporterBase::GetInputOutputNodes( for (const auto& control_output : specs_.control_outputs) TF_RETURN_IF_ERROR(add_node(control_output)); - return OkStatus(); + return absl::OkStatus(); } // TODO(jpienaar): Remove this post shape inference on import flag is removed. 
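The hunks in this file repeat one mechanical substitution: bare `OkStatus()` (pulled in from the `tensorflow` namespace) becomes the fully qualified `absl::OkStatus()`, now that TensorFlow's `Status` is an alias of `absl::Status`. A minimal standalone sketch of the resulting style, using only the real `absl::Status` API; the `ValidateNodeCount` helper is hypothetical and not part of this change:

#include "absl/status/status.h"
#include "absl/strings/str_cat.h"

// Hypothetical helper showing the migrated convention: success paths
// return absl::OkStatus(), error paths return a typed absl error.
absl::Status ValidateNodeCount(int node_count) {
  if (node_count < 0) {
    return absl::InvalidArgumentError(
        absl::StrCat("negative node count: ", node_count));
  }
  return absl::OkStatus();
}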
@@ -934,7 +935,7 @@ Status ImporterBase::AddNodesToShapeRefiner( << kOutputShapesAttrName << " attribute specifies shapes for " << list.shape_size() << " outputs"; - return OkStatus(); + return absl::OkStatus(); } for (const auto& shape : llvm::enumerate(list.shape())) { @@ -947,7 +948,7 @@ Status ImporterBase::AddNodesToShapeRefiner( } node_context->set_output(shape.index(), handle); } - return OkStatus(); + return absl::OkStatus(); }; // If it is the argument node, the shape handle is set explicitly, so it @@ -1069,7 +1070,7 @@ Status ImporterBase::AddNodesToShapeRefiner( } VLOG(1) << "Graph shapes were inferred with " << (i - 1) << " extra rounds of analysis to reach a fixpoint."; - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr ImporterBase::InferInputType(const Node& node, @@ -1228,7 +1229,7 @@ absl::StatusOr ImporterBase::InferOutputType( TF_ASSIGN_OR_RETURN( auto etype, ConvertToMlirTensorType(shape_proto, dtype, &builder)); return mlir::UnrankedTensorType::get(mlir::TF::ResourceType::get( - {etype.cast()}, builder.getContext())); + {mlir::cast(etype)}, builder.getContext())); } else { return mlir::UnrankedTensorType::get( mlir::TF::ResourceType::get(builder.getContext())); @@ -1331,7 +1332,7 @@ Status ImporterBase::ConvertFunctionCallAttribute(const std::string& base_name, NamedAttrList* attributes) { TF_ASSIGN_OR_RETURN(auto func_attr, ConvertFunctionCallName(value.func().name())); - if (!func_attr) return OkStatus(); + if (!func_attr) return absl::OkStatus(); attributes->push_back(builder_.getNamedAttr(base_name, func_attr)); for (const auto& it : value.func().attr()) { @@ -1339,7 +1340,7 @@ Status ImporterBase::ConvertFunctionCallAttribute(const std::string& base_name, TF_ASSIGN_OR_RETURN(auto value, ConvertAttributeValue(it.second)); attributes->push_back(builder_.getNamedAttr(name, value)); } - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr ImporterBase::ConvertFunctionCallName( @@ -1411,7 +1412,7 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { // done. 
if (tf_name_to_mlir_name_->find(std::string(func_name)) != tf_name_to_mlir_name_->end()) - return OkStatus(); + return absl::OkStatus(); std::string mlir_func_name( function_name_uniquifier_->GetUniqueName(func_name)); @@ -1458,7 +1459,7 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { } deferred_functions_.emplace(func_name.str(), attributes); - return OkStatus(); + return absl::OkStatus(); } Status ImporterBase::PruneUnreachableNodes( @@ -1475,7 +1476,7 @@ Status ImporterBase::PruneUnreachableNodes( } else { VLOG(1) << "No output nodes specified, skipping pruning"; } - return OkStatus(); + return absl::OkStatus(); } Status ImporterBase::ConvertFeedsToPlaceholders( @@ -1524,7 +1525,7 @@ Status ImporterBase::ConvertFeedsToPlaceholders( } } } - return OkStatus(); + return absl::OkStatus(); } Status ImporterBase::PrepareConvert(const Graph& graph, @@ -1568,7 +1569,7 @@ Status ImporterBase::PrepareConvert(const Graph& graph, [](const Node* n1, const Node* n2) { return n1->name() < n2->name(); }); } - return OkStatus(); + return absl::OkStatus(); } Status ImporterBase::Convert( @@ -1622,7 +1623,7 @@ Status ImporterBase::Convert( } } - return OkStatus(); + return absl::OkStatus(); } Status ImporterBase::ConvertFunctionArgAndRets( @@ -1659,7 +1660,7 @@ Status ImporterBase::ConvertFunctionArgAndRets( ret_attrs[index].set(dialect_attribute, converted_attr); } } - return OkStatus(); + return absl::OkStatus(); }; auto* bb = &func.front(); @@ -1753,7 +1754,7 @@ Status ImporterBase::ConvertFunctionArgAndRets( return list.getDictionary(context_); }))); - return OkStatus(); + return absl::OkStatus(); } mlir::Location ImporterBase::GetLocation(const Node& node) { @@ -2000,7 +2001,7 @@ mlir::Operation* ImporterBase::CreateOperation( record_resource = [&](mlir::Type type) { type.walk([&](mlir::Type t) { if (resource) return mlir::WalkResult::interrupt(); - if (type.isa()) { + if (mlir::isa(type)) { resource = true; return mlir::WalkResult::interrupt(); } @@ -2035,7 +2036,7 @@ Status ImporterBase::ConvertNode(const Node& node) { if (!node.IsOp()) { // Don't import the pseudo-nodes _SOURCE or _SINK. These are added by // Graph and don't exist in GraphDef. - return OkStatus(); + return absl::OkStatus(); } // If it is a custom OP, its definition should be found in the library. We @@ -2223,7 +2224,7 @@ Status ImporterBase::ConvertNode(const Node& node) { // Register the mapping between the TF node and the newly created operation. node_values_[node.id()] = CreateOperation(node, node_type_name, result, control_operands); - return OkStatus(); + return absl::OkStatus(); } // Add the backedges to the CFG. Given a backedge, we replace the original @@ -2249,7 +2250,7 @@ Status ImporterBase::AddBackedges() { auto* dst = node_values_[edge.dst->id()]; TF_RETURN_IF_ERROR(AddBackedge(sink, dst, edge.dst_input)); } - return OkStatus(); + return absl::OkStatus(); } Status ImporterBase::AddBackedge(mlir::Operation* sink, mlir::Operation* dst, @@ -2285,7 +2286,7 @@ Status ImporterBase::AddBackedge(mlir::Operation* sink, mlir::Operation* dst, } dst->dropAllReferences(); dst->erase(); - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr ImporterBase::InferLibFunctionType( @@ -2714,7 +2715,7 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( (*nodes)[index].node->name(), "'"); (*nodes)[index] = {node, 0}; - return OkStatus(); + return absl::OkStatus(); }; // Collect arg and ret nodes from graph. 
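The companion change visible in the hunks above swaps the deprecated member-function casts (`x.isa<T>()`, `x.cast<T>()`, `x.dyn_cast<T>()`) for the free functions `mlir::isa`, `mlir::cast`, and `mlir::dyn_cast`, which is also why `mlir/Support/LLVM.h` joins the include lists throughout this patch. A small sketch of the new style against real MLIR builtin types; the `IsRankedFloatTensor` helper itself is illustrative, not part of the patch:

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"

// Illustrative helper: classify a type using the free-function cast API.
bool IsRankedFloatTensor(mlir::Type type) {
  // mlir::dyn_cast yields a null handle when the cast fails.
  if (auto tensor = mlir::dyn_cast<mlir::RankedTensorType>(type)) {
    return mlir::isa<mlir::FloatType>(tensor.getElementType());
  }
  return false;
}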
@@ -2758,7 +2759,7 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( Status GraphDefImporter::GetControlRetsFromGraph( llvm::ArrayRef control_outputs, absl::InlinedVector* control_ret_nodes) { - if (control_outputs.empty()) return OkStatus(); + if (control_outputs.empty()) return absl::OkStatus(); llvm::SmallDenseMap controls_to_idx; for (const auto& control_and_idx : llvm::enumerate(control_outputs)) @@ -2779,7 +2780,7 @@ Status GraphDefImporter::GetControlRetsFromGraph( return errors::InvalidArgument( "Control output '", std::get<1>(node_and_name), "' is missing"); - return OkStatus(); + return absl::OkStatus(); } // Stateful helper class to import a TensorFlow model expressed in SavedModel @@ -3059,7 +3060,7 @@ Status DiagnoseMultipleConcreteFunctions(const SavedObjectGraph& object_graph, } } } - return OkStatus(); + return absl::OkStatus(); } // Recursively traverses a StructuredValue, linearizing all the leaves. @@ -3187,10 +3188,10 @@ void StructuredValueLinearizer::RecursivelyFindLeaves( << " at index path: "; for (auto path_element : current_index_path_) { os << "."; - if (auto integer = path_element.dyn_cast()) { + if (auto integer = mlir::dyn_cast(path_element)) { os << integer.getValue(); } else { - auto str = path_element.cast(); + auto str = mlir::cast(path_element); os << str.getValue(); } } @@ -3357,7 +3358,7 @@ Status CreateSavedModelIR( const TrackableObjectGraph::TrackableObject& trackable_object) { restored_objects.insert( std::make_pair(saved_node_id, &trackable_object)); - return OkStatus(); + return absl::OkStatus(); })); for (int node_id = 0; node_id < object_graph.nodes_size(); node_id++) { @@ -3554,7 +3555,7 @@ Status CreateSavedModelIR( module->setAttr("tf_saved_model.semantics", builder.getUnitAttr()); SortSavedModelModule(module); MarkSavedModelFunctionVisibility(module); - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr> @@ -3893,7 +3894,7 @@ Status SavedModelSignatureDefImporterLite::MoveConvertedFunctionsToModule( symbol_table_.insert(func.clone()); } - return OkStatus(); + return absl::OkStatus(); } Status SavedModelSignatureDefImporterLite::ConvertInitializer( @@ -4271,7 +4272,7 @@ Status SavedModelSignatureDefImporter::LiftVariables( return diag_handler.Combine( errors::Internal("Failed to dedup bound inputs.")); - return OkStatus(); + return absl::OkStatus(); } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h index 1670fd11a1f819..bca1f7f80af9e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -39,13 +39,13 @@ inline constexpr absl::string_view kImportModelDefaultGraphFuncName = "main"; // Given a GraphDef, returns a MLIR module containing the graph, expressed with // tf_executor dialect. -tsl::StatusOr> ConvertGraphdefToMlir( +absl::StatusOr> ConvertGraphdefToMlir( const GraphDef& graphdef, const GraphDebugInfo& debug_info, const GraphImportConfig& specs, mlir::MLIRContext* context); // Given a Graph, returns a MLIR module containing the graph, expressed with // tf_executor dialect. 
-tsl::StatusOr> ConvertGraphToMlir( +absl::StatusOr> ConvertGraphToMlir( const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const GraphImportConfig& specs, mlir::MLIRContext* context); @@ -53,19 +53,19 @@ tsl::StatusOr> ConvertGraphToMlir( // [Experimental] // Given a Function, returns a MLIR module containing the graph, expressed with // tf_executor dialect. -tsl::StatusOr> ConvertFunctionToMlir( +absl::StatusOr> ConvertFunctionToMlir( const FunctionBody* fbody, const FunctionLibraryDefinition& flib_def, mlir::MLIRContext* context); // Given a SavedModel, returns a MLIR module containing the functions, expressed // with tf_executor dialect. -tsl::StatusOr> ConvertSavedModelToMlir( +absl::StatusOr> ConvertSavedModelToMlir( SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, absl::Span exported_names, MLIRImportOptions options = {}); // Given a V1 SavedModel, returns a MLIR module containing the functions, // expressed with tf_executor dialect. -tsl::StatusOr> ConvertSavedModelV1ToMlir( +absl::StatusOr> ConvertSavedModelV1ToMlir( const SavedModelBundle& saved_model, absl::Span exported_names, mlir::MLIRContext* context, MLIRImportOptions options = {}); @@ -79,7 +79,7 @@ tsl::StatusOr> ConvertSavedModelV1ToMlir( // ConvertSavedModelV1ToMlir(), and is not related to TFLite. // // TODO(b/179683149): Rename this class to avoid confusion with TFLite. -tsl::StatusOr> ConvertSavedModelV1ToMlirLite( +absl::StatusOr> ConvertSavedModelV1ToMlirLite( const MetaGraphDef& meta_graph_def, const GraphDebugInfo& debug_info, std::optional> exported_names, mlir::MLIRContext* context, MLIRImportOptions options); @@ -112,8 +112,8 @@ class SavedModelMLIRImportInput { // and remain valid for the graph. // `name` is a unique identifier for this subgraph, so the implementation can // use it for eg. debugging or caching compilation results. - virtual tsl::StatusOr GetSubGraph(absl::string_view name, - GraphImportConfig& specs) = 0; + virtual absl::StatusOr GetSubGraph( + absl::string_view name, GraphImportConfig& specs) = 0; private: const MetaGraphDef* meta_graph_def_ = nullptr; @@ -131,7 +131,7 @@ class SavedModelMLIRImportInput { // ConvertSavedModelV1ToMlir(), and is not related to TFLite. // // TODO(b/179683149): Rename this class to avoid confusion with TFLite. 
-tsl::StatusOr> ConvertSavedModelV1ToMlirLite( +absl::StatusOr> ConvertSavedModelV1ToMlirLite( SavedModelMLIRImportInput& input, std::optional> exported_names, mlir::MLIRContext* context, diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc index 09115768652d32..dbccd07976a997 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.cc @@ -70,7 +70,7 @@ std::string GraphImportConfig::str() const { Status ParseOutputArrayInfo(absl::string_view array_names, std::vector* outputs) { TF_RETURN_IF_ERROR(ParseNodeNames(array_names, *outputs)); - return OkStatus(); + return absl::OkStatus(); } Status ParseOutputArrayInfo(const std::vector& output_names, @@ -79,7 +79,7 @@ Status ParseOutputArrayInfo(const std::vector& output_names, if (output_name.empty()) continue; outputs->push_back(output_name); } - return OkStatus(); + return absl::OkStatus(); } Status ParseInputArrayInfo(absl::string_view array_names, @@ -138,7 +138,7 @@ static Status HandleSubtype(absl::string_view subtype, subtype_tensor_shape.add_dim()->set_size(dim); } *result = {subtype_dtype, subtype_tensor_shape}; - return OkStatus(); + return absl::OkStatus(); } Status ParseInputArrayInfo( @@ -214,7 +214,7 @@ Status ParseInputArrayInfo( } } } - return OkStatus(); + return absl::OkStatus(); } Status ParseNodeShapes( @@ -232,13 +232,13 @@ Status ParseNodeShapes( shapes_vector.push_back(std::move(shape)); } } - return OkStatus(); + return absl::OkStatus(); } Status ParseNodeNames(absl::string_view names_str, std::vector& names_vector) { names_vector = absl::StrSplit(names_str, ',', absl::SkipEmpty()); - return OkStatus(); + return absl::OkStatus(); } static absl::StatusOr> ParseDTypesHelper( @@ -290,7 +290,7 @@ Status ParseNodeDataTypes(absl::string_view data_types_str, if (!data_types_str.empty()) { TF_ASSIGN_OR_RETURN(data_type_vector, ParseDTypesHelper(data_types_str)); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc deleted file mode 100644 index f0b415062f2d27..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h" - -#include - -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/Verifier.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "xla/status_macros.h" -#include "tensorflow/core/common_runtime/graph_constructor.h" -#include "tensorflow/core/framework/graph_debug_info.pb.h" - -namespace tensorflow { - -using mlir::MLIRContext; - -static absl::StatusOr> Import( - const GraphOptimizationPassOptions& options, const Graph& graph, - MLIRContext* context) { - // TODO(fengliuai): get debug info at runtime. - GraphDebugInfo debug_info; - GraphImportConfig specs; - specs.enable_shape_inference = options.shape_inference_on_tfe_dialect_import; - - TF_ASSIGN_OR_RETURN( - auto module, - ConvertGraphToMlir(graph, debug_info, *options.flib_def, specs, context)); - mlir::StatusScopedDiagnosticHandler status_handler(context); - if (failed(mlir::verify(*module))) { - if (VLOG_IS_ON(1)) module->dump(); - return status_handler.ConsumeStatus(); - } - return module; -} - -static Status Export(mlir::OwningOpRef module, - const GraphOptimizationPassOptions& options, - std::unique_ptr* graph) { - GraphExportConfig confs; - return ConvertMlirToGraph(*module, confs, graph, options.flib_def); -} - -static Status Roundtrip(const GraphOptimizationPassOptions& options, - std::unique_ptr* graph, MLIRContext* context) { - TF_ASSIGN_OR_RETURN(auto module, Import(options, **graph, context)); - return Export(std::move(module), options, graph); -} - -Status MlirRoundtripPass::Run(const GraphOptimizationPassOptions& options) { - MLIRContext context; - if (options.graph) return Roundtrip(options, options.graph, &context); - - // If the graph is partitioned, then try and round trip them individually. - for (auto& it : *options.partition_graphs) { - VLOG(1) << "Roundtripping: " << it.first; - // TODO(jpienaar): Roundtrip results in different failures, investigate. - TF_RETURN_IF_ERROR(Import(options, *it.second, &context).status()); - } - return OkStatus(); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h deleted file mode 100644 index 81500cc9b78a76..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_PASS_H_ -#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_PASS_H_ - -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "tensorflow/core/common_runtime/optimization_registry.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { - -// An optimization pass that simply roundtrips the Graph to MLIR and back. -class MlirRoundtripPass : public GraphOptimizationPass { - public: - Status Run(const GraphOptimizationPassOptions& options) override; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_MLIR_ROUNDTRIP_PASS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index b0759da88e4ced..6eaa15e37d45e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/cc/saved_model/bundle_v2.h" #include "tensorflow/cc/saved_model/reader.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" @@ -46,7 +47,7 @@ limitations under the License. namespace tensorflow { -static StatusOr> GraphdefToMlirImport( +static absl::StatusOr> GraphdefToMlirImport( llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, @@ -109,7 +110,8 @@ static StatusOr> GraphdefToMlirImport( context); } -StatusOr> GraphdefToMlirTranslateFunction( +absl::StatusOr> +GraphdefToMlirTranslateFunction( llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, @@ -125,7 +127,8 @@ StatusOr> GraphdefToMlirTranslateFunction( return module_or; } -StatusOr> GraphdefToMlirTranslateFunction( +absl::StatusOr> +GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, @@ -147,11 +150,12 @@ StatusOr> GraphdefToMlirTranslateFunction( context); } -StatusOr> SavedModelObjectGraphToMlirImport( - absl::string_view saved_model_dir, - const std::unordered_set& tags, - absl::Span exported_names, mlir::MLIRContext* context, - bool unconditionally_use_set_output_shapes) { +absl::StatusOr> +SavedModelObjectGraphToMlirImport(absl::string_view saved_model_dir, + const std::unordered_set& tags, + absl::Span exported_names, + mlir::MLIRContext* context, + bool unconditionally_use_set_output_shapes) { tensorflow::SavedModelV2Bundle bundle; auto load_status = tensorflow::SavedModelV2Bundle::Load( std::string(saved_model_dir.data(), saved_model_dir.length()), &bundle); @@ -174,7 +178,8 @@ StatusOr> SavedModelObjectGraphToMlirImport( return module_or; } -StatusOr> SavedModelSignatureDefsToMlirImport( +absl::StatusOr> +SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, absl::Span exported_names, mlir::MLIRContext* context, @@ -210,15 +215,15 @@ StatusOr> 
SavedModelSignatureDefsToMlirImport( return module_or; } -StatusOr> +absl::StatusOr> SavedModelSignatureDefsToMlirImportLite( absl::string_view saved_model_dir, const std::unordered_set& tags, absl::Span exported_names, mlir::MLIRContext* context, MLIRImportOptions options) { MetaGraphDef meta_graph_def; - auto status = ReadMetaGraphDefFromSavedModel(std::string(saved_model_dir), - tags, &meta_graph_def); + auto status = + ReadMetaGraphDefFromSavedModel(saved_model_dir, tags, &meta_graph_def); if (!status.ok()) { LOG(ERROR) << "Failed to load saved model v1 '" << saved_model_dir << "': " << status; @@ -239,7 +244,7 @@ SavedModelSignatureDefsToMlirImportLite( return module_or; } -StatusOr> +absl::StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, @@ -263,7 +268,7 @@ GraphdefToSplattedMlirTranslateFunction( if (auto attr = inst.getAttrOfType(attr_id)) { mlir::Attribute rand_val; mlir::Type element_type = attr.getShapedType().getElementType(); - if (element_type.isa()) { + if (mlir::isa(element_type)) { rand_val = mlir::IntegerAttr::get(element_type, std::rand()); } else if (element_type.isF16() || element_type.isF32() || element_type.isF64()) { @@ -286,7 +291,7 @@ GraphdefToSplattedMlirTranslateFunction( return module_or; } -StatusOr> +absl::StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index 3dd76e2c12e85e..cd86b27e13550c 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -53,7 +53,8 @@ struct GraphdefToMlirOptions { // Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. // Creates MLIR entities into the given MLIR `context`. -StatusOr> GraphdefToMlirTranslateFunction( +absl::StatusOr> +GraphdefToMlirTranslateFunction( llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, @@ -66,7 +67,8 @@ ABSL_DEPRECATED( "inputs instead of strings") // Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. // Creates MLIR entities into the given MLIR `context`. -StatusOr> GraphdefToMlirTranslateFunction( +absl::StatusOr> +GraphdefToMlirTranslateFunction( llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, @@ -74,7 +76,7 @@ StatusOr> GraphdefToMlirTranslateFunction( // Similar to the above function, but replaces all constant tensors // with randomly generated splat values. -StatusOr> +absl::StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, @@ -88,7 +90,7 @@ ABSL_DEPRECATED( "inputs instead of strings") // Similar to the above function, but replaces all constant tensors // with randomly generated splat values.
-StatusOr> +absl::StatusOr> GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, @@ -98,7 +100,8 @@ GraphdefToSplattedMlirTranslateFunction( // Converts a TensorFlow SavedModel stored in the directory with the given // `saved_model_dir` into a MLIR module. Creates MLIR entities into the // given MLIR `context`. -StatusOr> SavedModelObjectGraphToMlirImport( +absl::StatusOr> +SavedModelObjectGraphToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, absl::Span exported_names, mlir::MLIRContext* context, @@ -108,7 +111,8 @@ StatusOr> SavedModelObjectGraphToMlirImport( // `saved_model_dir` into a MLIR module. Creates MLIR entities into the // given MLIR `context`. // 'saved_model_bundle' if not null, will be initialized with the model bundle. -StatusOr> SavedModelSignatureDefsToMlirImport( +absl::StatusOr> +SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, absl::Span exported_names, mlir::MLIRContext* context, @@ -120,7 +124,7 @@ StatusOr> SavedModelSignatureDefsToMlirImport( // `saved_model_dir` into a MLIR module. Creates MLIR entities into the // given MLIR `context`. This does not create session internally so it is faster // and does not perform any graph transformation. -StatusOr> +absl::StatusOr> SavedModelSignatureDefsToMlirImportLite( absl::string_view saved_model_dir, const std::unordered_set& tags, diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc index eb9bf3db34106d..0357a2e5f22986 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc @@ -172,7 +172,7 @@ static LogicalResult MlirToGraphdefTranslateFunction( confs.export_entry_func_to_flib = export_entry_func_to_flib; confs.export_original_tf_func_name = export_original_tf_func_name; - StatusOr> graphdef_or( + absl::StatusOr> graphdef_or( tensorflow::ConvertMlirToGraphdef(module, confs)); if (!graphdef_or.status().ok()) { LOG(ERROR) << "Graph export failed: " << graphdef_or.status(); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc deleted file mode 100644 index 856db032e501ae..00000000000000 --- a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/ToolOutputFile.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" -#include "tsl/platform/protobuf.h" - -namespace mlir { -static mlir::Operation* ExtractOnlyOp(mlir::ModuleOp module) { - mlir::func::FuncOp fn = module.lookupSymbol("main"); - if (!fn) return nullptr; - - if (!llvm::hasSingleElement(fn)) return nullptr; - - // Here, modules with exactly two operations in the only basic block are - // supported. The last operation should be a terminator operation and the - // other operation is the operation of interest. - auto& block = fn.front(); - if (block.getOperations().size() != 2) return nullptr; - if (!block.back().hasTrait()) return nullptr; - - return &block.front(); -} - -static LogicalResult MlirToTfNodeDef(ModuleOp module, - llvm::raw_ostream& output) { - auto* context = module.getContext(); - - Operation* op = ExtractOnlyOp(module); - if (!op) { - emitError(UnknownLoc::get(context), - "modules with exactly one op other than terminator in a " - "'main' function's " - "only block are supported"); - return failure(); - } - - auto node_def_or = tensorflow::ConvertTFDialectOpToNodeDef( - op, "node_name", /*ignore_unregistered_attrs=*/false); - if (!node_def_or.ok()) { - op->emitError("failed to convert to TF NodeDef:") - << node_def_or.status().ToString(); - return failure(); - } - - output << tsl::LegacyUnredactedDebugString(*node_def_or.value()); - return success(); -} - -// Test only translation to convert a simple MLIR module with a single TF -// dialect op to NodeDef. -static TranslateFromMLIRRegistration translate_from_mlir_registration( - "test-only-mlir-to-tf-nodedef", "test-only-mlir-to-tf-nodedef", - MlirToTfNodeDef, mlir::RegisterAllTensorFlowDialects); - -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.cc index 098c7d19411979..45235b2931187c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.cc @@ -15,7 +15,6 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" -#include #include #include #include @@ -34,16 +33,15 @@ using ::tensorflow::kValidDeviceTypes; LogicalResult HasValidCompilationAndReplicationAttributes(Operation& op) { auto replicate_attr = op.getAttrOfType(kReplicationInfoAttr); auto compile_attr = op.getAttrOfType(kCompileDeviceTypeAttr); - if (replicate_attr && !compile_attr) { + if (!replicate_attr && !compile_attr) return success(); + if (!replicate_attr || !compile_attr) + return op.emitOpError() << "is expected to have either both or none of '" + << kReplicationInfoAttr << "' and '" + << kCompileDeviceTypeAttr << "' attributes."; + if (replicate_attr.getValue().empty()) return op.emitOpError() - << "has '" << kReplicationInfoAttr << "' attribute but not '" - << kCompileDeviceTypeAttr << "' attribute which is unsupported"; - } - if (replicate_attr && replicate_attr.getValue().empty()) { - return op.emitOpError() - << "has an empty '" << kReplicationInfoAttr << "' attribute"; - } - if (compile_attr && failed(IsValidDeviceTypeOrEmpty(compile_attr))) { + << "has an empty '" << kReplicationInfoAttr << "' attribute."; + if (failed(IsValidDeviceTypeOrEmpty(compile_attr))) { return op.emitOpError() << "has invalid '" << kCompileDeviceTypeAttr << "' value '" << compile_attr.getValue() << "'"; } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h index 5a99806d4295f3..0771b529465a94 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h @@ -22,6 +22,7 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/tf2xla/tf2xla_defs.h" namespace mlir { @@ -167,7 +168,7 @@ class IdentityNOp; // as an attribute. template bool GetValueAsConstant(Value val, AttrT &attr) { - while (auto result = val.dyn_cast()) { + while (auto result = mlir::dyn_cast(val)) { Operation *op = result.getOwner(); if (!isa(op) && !isa(op)) break; val = op->getOperand(result.getResultNumber()); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.cc index fd3c00a3873e5c..030b8ae7575a40 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" namespace mlir { @@ -54,7 +55,7 @@ llvm::SmallVector GetEntryFunctions(ModuleOp module) { LogicalResult GetCallees(SymbolUserOpInterface op, SymbolTable &symtab, llvm::SmallVector &callees) { for (auto attr : op->getAttrs()) { - auto sym = attr.getValue().dyn_cast(); + auto sym = mlir::dyn_cast(attr.getValue()); if (!sym) continue; auto callee = symtab.lookup(sym.getRootReference()); if (!callee) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc index 341749eddd0f63..9262f87edb46bd 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc @@ -32,7 +32,7 @@ namespace { constexpr StringRef kTestClusterName = "tpu0"; -tsl::StatusOr> GetMlirModuleFromString( +absl::StatusOr> GetMlirModuleFromString( StringRef string, MLIRContext* context) { DialectRegistry mlir_registry; RegisterAllTensorFlowDialects(mlir_registry); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.cc index fc0ee8b9d20691..5e320f3ab01a5e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.cc @@ -28,8 +28,8 @@ namespace tensorflow { // Converts non func AttrValue proto into an MLIR attribute. Func attribute is // exclused in this function because the function might be renamed when the // function definition is imported. -StatusOr ConvertNonFuncAttributeValue(const AttrValue& value, - mlir::Builder* builder) { +absl::StatusOr ConvertNonFuncAttributeValue( + const AttrValue& value, mlir::Builder* builder) { switch (value.value_case()) { case AttrValue::kI: return builder->getI64IntegerAttr(value.i()); @@ -90,8 +90,8 @@ StatusOr ConvertNonFuncAttributeValue(const AttrValue& value, } } -StatusOr ConvertAttributeValue(const AttrValue& value, - mlir::Builder* builder) { +absl::StatusOr ConvertAttributeValue(const AttrValue& value, + mlir::Builder* builder) { switch (value.value_case()) { case AttrValue::kFunc: { // TODO(b/156546237): Unify kFunc/NameAttrList attribute representation. diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h index 18f732081de8ee..10271fcbd60f5c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h @@ -27,12 +27,12 @@ using tsl::StatusOr; // Converts non func AttrValue proto into an MLIR attribute. Func attribute is // exclused in this function because the function might be renamed when the // function definition is imported. -StatusOr ConvertNonFuncAttributeValue(const AttrValue& value, - mlir::Builder* builder); +absl::StatusOr ConvertNonFuncAttributeValue( + const AttrValue& value, mlir::Builder* builder); // Converts all kinds of AttrValue proto into an MLIR attribute. 
-StatusOr ConvertAttributeValue(const AttrValue& value, - mlir::Builder* builder); +absl::StatusOr ConvertAttributeValue(const AttrValue& value, + mlir::Builder* builder); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index 10e882192cfdf3..b9fef486428977 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -34,6 +34,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -242,12 +243,12 @@ void ConvertToTensorShapeProto(ArrayRef shape, } PartialTensorShape ConvertTypeToTensorShape(const mlir::Type& type) { - if (type.isa()) { + if (mlir::isa(type)) { // An empty PartialTensorShape indicates an unranked tensor. return PartialTensorShape(); } - if (auto tensor_type = type.dyn_cast()) { + if (auto tensor_type = mlir::dyn_cast(type)) { TensorShapeProto tensor_shape_proto; ConvertToTensorShapeProto(tensor_type.getShape(), &tensor_shape_proto); return PartialTensorShape(tensor_shape_proto); @@ -259,11 +260,11 @@ PartialTensorShape ConvertTypeToTensorShape(const mlir::Type& type) { } mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { - if (type.isa()) { + if (mlir::isa(type)) { return mlir::TF::ShapeAttr::get(type.getContext(), std::nullopt); } - if (auto tensor_type = type.dyn_cast()) { + if (auto tensor_type = mlir::dyn_cast(type)) { return mlir::TF::ShapeAttr::get(type.getContext(), tensor_type.getShape()); } @@ -427,10 +428,10 @@ Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { output->set_dtype(output_dtype); ConvertToTensorShapeProto(shape, output->mutable_tensor_shape()); - if (auto tensor_attr = attr.dyn_cast()) + if (auto tensor_attr = mlir::dyn_cast(attr)) return ConvertTensorProtoAttr(tensor_attr, output); - auto dense_attr = attr.dyn_cast(); + auto dense_attr = mlir::dyn_cast(attr); if (!dense_attr) return errors::InvalidArgument("Unsupported elements attr"); switch (output_dtype) { @@ -496,7 +497,7 @@ Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { output->mutable_tensor_content()); break; case DT_STRING: - ConvertStringElementsAttr(dense_attr.cast(), + ConvertStringElementsAttr(mlir::cast(dense_attr), output->mutable_string_val()); break; case DT_UINT8: @@ -521,7 +522,7 @@ Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { return errors::Unimplemented(absl::StrCat("Unimplemented data type ", DataTypeString(output_dtype))); } - return OkStatus(); + return absl::OkStatus(); } Status ConvertToTensor(const mlir::ElementsAttr attr, Tensor* output_tensor) { @@ -530,7 +531,7 @@ Status ConvertToTensor(const mlir::ElementsAttr attr, Tensor* output_tensor) { if (!output_tensor->FromProto(tensor_proto)) { return InvalidArgument("Couldn't convert tensor proto to tensor."); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h index 
227e4bf465f70b..92d6ee4bb65356 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h @@ -31,12 +31,12 @@ namespace tensorflow { using tsl::StatusOr; // Converts a TensorFlow tensor proto into an MLIR elements attribute. -StatusOr ConvertTensorProto(const TensorProto& input_tensor, - mlir::Builder* builder); +absl::StatusOr ConvertTensorProto( + const TensorProto& input_tensor, mlir::Builder* builder); // Converts a TensorFlow tensor into an MLIR elements attribute. -StatusOr ConvertTensor(const Tensor& input_tensor, - mlir::Builder* builder); +absl::StatusOr ConvertTensor(const Tensor& input_tensor, + mlir::Builder* builder); // Converts a shape from MLIR to a TensorFlow tensor shape proto. void ConvertToTensorShapeProto(llvm::ArrayRef shape, @@ -53,8 +53,8 @@ absl::StatusOr ConvertTypeToTensorSpecProto( const mlir::Type& type); // Converts a TensorFlow shape attribute to an MLIR shape attribute. -StatusOr ConvertTensorShapeProto(const TensorShapeProto& shape, - mlir::MLIRContext* context); +absl::StatusOr ConvertTensorShapeProto( + const TensorShapeProto& shape, mlir::MLIRContext* context); // Converts an MLIR elements attribute to a TensorFlow tensor proto. Status ConvertToTensorProto(mlir::ElementsAttr attr, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index f3c51f88fc7630..3feed8904fab0e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "xla/test.h" @@ -97,8 +98,8 @@ TEST(ConvertTypeToTensorTypeTest, ConvertStringTensor) { ASSERT_TRUE(value_or_status.ok()); auto attr = value_or_status.value(); - EXPECT_TRUE(attr.isa()); - auto string_attr = attr.cast(); + EXPECT_TRUE(mlir::isa(attr)); + auto string_attr = mlir::cast(attr); auto string_values = string_attr.getRawStringData(); ASSERT_EQ(string_values.size(), 4); EXPECT_EQ(string_values[0], mlir::StringRef("one")); @@ -191,7 +192,7 @@ TEST_F(ConvertTensorTest, Simple) { } bool IsSplat(mlir::ElementsAttr attr) { - return attr.cast().isSplat(); + return mlir::cast(attr).isSplat(); } TEST(ConvertTensorProtoTest, SplatTensor) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc index 880501c3e89554..e3404d613c9f83 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc @@ -22,6 +22,7 @@ limitations under the License.
#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/core/framework/types.h" @@ -38,61 +39,61 @@ Status ConvertDataType(DataType dtype, Builder builder, Type* type) { switch (dtype) { case DT_HALF: *type = builder.getF16Type(); - return OkStatus(); + return absl::OkStatus(); case DT_FLOAT: *type = builder.getF32Type(); - return OkStatus(); + return absl::OkStatus(); case DT_DOUBLE: *type = builder.getF64Type(); - return OkStatus(); + return absl::OkStatus(); case DT_BOOL: *type = builder.getIntegerType(1); - return OkStatus(); + return absl::OkStatus(); case DT_INT8: *type = builder.getIntegerType(8); - return OkStatus(); + return absl::OkStatus(); case DT_INT16: *type = builder.getIntegerType(16); - return OkStatus(); + return absl::OkStatus(); case DT_INT32: *type = builder.getIntegerType(32); - return OkStatus(); + return absl::OkStatus(); case DT_INT64: *type = builder.getIntegerType(64); - return OkStatus(); + return absl::OkStatus(); case DT_UINT8: *type = builder.getIntegerType(8, /*isSigned=*/false); - return OkStatus(); + return absl::OkStatus(); case DT_UINT16: *type = builder.getIntegerType(16, /*isSigned=*/false); - return OkStatus(); + return absl::OkStatus(); case DT_UINT32: *type = builder.getIntegerType(32, /*isSigned=*/false); - return OkStatus(); + return absl::OkStatus(); case DT_UINT64: *type = builder.getIntegerType(64, /*isSigned=*/false); - return OkStatus(); + return absl::OkStatus(); case DT_BFLOAT16: *type = builder.getBF16Type(); - return OkStatus(); + return absl::OkStatus(); case DT_COMPLEX64: *type = mlir::ComplexType::get(builder.getF32Type()); - return OkStatus(); + return absl::OkStatus(); case DT_COMPLEX128: *type = mlir::ComplexType::get(builder.getF64Type()); - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_FLOAT8_E4M3FN: *type = builder.getFloat8E4M3FNType(); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); case tensorflow::DT_FLOAT8_E5M2: *type = builder.getFloat8E5M2Type(); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); case DT_INT4: *type = builder.getIntegerType(4, /*isSigned=*/true); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); case DT_UINT4: *type = builder.getIntegerType(4, /*isSigned=*/false); - return ::tensorflow::OkStatus(); + return absl::OkStatus(); #define HANDLE_TF_TYPE(tftype, enumerant, name) \ case DT_##enumerant: \ *type = builder.getType(); \ @@ -108,54 +109,54 @@ Status ConvertDataType(DataType dtype, Builder builder, Type* type) { Status ConvertScalarTypeToDataType(Type type, DataType* dtype) { if (type.isF16()) { *dtype = DT_HALF; - return OkStatus(); + return absl::OkStatus(); } else if (type.isF32()) { *dtype = DT_FLOAT; - return OkStatus(); + return absl::OkStatus(); } else if (type.isF64()) { *dtype = DT_DOUBLE; - return OkStatus(); + return absl::OkStatus(); } else if (type.isBF16()) { *dtype = DT_BFLOAT16; - return OkStatus(); + return absl::OkStatus(); } else if (type.isFloat8E4M3FN()) { *dtype = DT_FLOAT8_E4M3FN; - return OkStatus(); + return absl::OkStatus(); } else if (type.isFloat8E5M2()) { *dtype = DT_FLOAT8_E5M2; - return OkStatus(); - } else if (auto itype = type.dyn_cast()) { + return absl::OkStatus(); + } else if (auto itype = 
mlir::dyn_cast(type)) { switch (itype.getWidth()) { case 1: *dtype = DT_BOOL; - return OkStatus(); + return absl::OkStatus(); case 4: *dtype = itype.isUnsigned() ? DT_UINT4 : DT_INT4; - return OkStatus(); + return absl::OkStatus(); case 8: *dtype = itype.isUnsigned() ? DT_UINT8 : DT_INT8; - return OkStatus(); + return absl::OkStatus(); case 16: *dtype = itype.isUnsigned() ? DT_UINT16 : DT_INT16; - return OkStatus(); + return absl::OkStatus(); case 32: *dtype = itype.isUnsigned() ? DT_UINT32 : DT_INT32; - return OkStatus(); + return absl::OkStatus(); case 64: *dtype = itype.isUnsigned() ? DT_UINT64 : DT_INT64; - return OkStatus(); + return absl::OkStatus(); default: return errors::Unimplemented( absl::StrCat("Converting ", debugString(type), " to DataType")); } - } else if (auto complex_type = type.dyn_cast()) { + } else if (auto complex_type = mlir::dyn_cast(type)) { auto etype = complex_type.getElementType(); if (etype.isF32()) { *dtype = DT_COMPLEX64; - return OkStatus(); + return absl::OkStatus(); } else if (etype.isF64()) { *dtype = DT_COMPLEX128; - return OkStatus(); + return absl::OkStatus(); } return errors::Unimplemented( absl::StrCat("Converting ", debugString(type), " to DataType")); @@ -174,13 +175,13 @@ Status ConvertScalarTypeToDataType(Type type, DataType* dtype) { } Status ConvertToDataType(Type type, DataType* dtype) { - if (auto stype = type.dyn_cast()) { + if (auto stype = mlir::dyn_cast(type)) { TF_RETURN_IF_ERROR( ConvertScalarTypeToDataType(stype.getElementType(), dtype)); } else { TF_RETURN_IF_ERROR(ConvertScalarTypeToDataType(type, dtype)); } - return OkStatus(); + return absl::OkStatus(); } void ConvertToMlirShape(const TensorShape& input_shape, @@ -202,7 +203,7 @@ Status ConvertToMlirShape(const TensorShapeProto& input_shape, shape->push_back(d.size() == kTFDynamicSize ? ShapedType::kDynamic : d.size()); } - return OkStatus(); + return absl::OkStatus(); } absl::StatusOr ConvertToMlirTensorType( diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.h b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.h index 35a3d1fb156f2b..3c21aa260499c1 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.h @@ -45,9 +45,8 @@ Status ConvertToMlirShape(const TensorShapeProto& input_shape, llvm::SmallVectorImpl* shape); // Given a tensor shape and dtype, get the corresponding MLIR tensor type. -StatusOr ConvertToMlirTensorType(const TensorShapeProto& shape, - DataType dtype, - mlir::Builder* builder); +absl::StatusOr ConvertToMlirTensorType( + const TensorShapeProto& shape, DataType dtype, mlir::Builder* builder); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc index 51db1be0820761..d9249d472b334c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_set.h" @@ -67,7 +68,7 @@ mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, for (const auto& kv : llvm::enumerate(array_attr)) { const int idx = kv.index(); - auto string_attr = kv.value().dyn_cast(); + auto string_attr = mlir::dyn_cast(kv.value()); if (!string_attr) return op->emitOpError(llvm::formatv( "bad '{0}' attribute at index {1}, not a string", kDevicesAttr, idx)); @@ -100,7 +101,7 @@ mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, llvm::formatv("bad '{0}' attribute, '{1}', not a valid device", kDevicesAttr, name.strref())); - if (auto gpu_metadata = attr.dyn_cast()) { + if (auto gpu_metadata = mlir::dyn_cast(attr)) { devices->AddGpuDevice(device, gpu_metadata); } else { devices->AddDevice(device); @@ -144,10 +145,11 @@ mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, auto devices_attr = op->getAttr(kDevicesAttr); if (!devices_attr) return mlir::success(); - if (auto array_attr = devices_attr.dyn_cast()) { + if (auto array_attr = mlir::dyn_cast(devices_attr)) { return GetDevicesFromOp(op, array_attr, devices); - } else if (auto dict_attr = devices_attr.dyn_cast()) { + } else if (auto dict_attr = + mlir::dyn_cast(devices_attr)) { return GetDevicesFromOp(op, dict_attr, devices); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index 326dbbb4781602..f089ec111991e7 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_set.h" @@ -87,18 +88,18 @@ TEST(DeviceUtilTest, AddDeviceToOp) { ASSERT_EQ(devices_attr.size(), 3); // CPU device added with an empty metadata. - auto device_meta_0 = devices_attr.get(cpu0).dyn_cast(); + auto device_meta_0 = mlir::dyn_cast(devices_attr.get(cpu0)); ASSERT_NE(device_meta_0, nullptr); // GPU device successfully parsed compute capability from description. auto device_meta_1 = - devices_attr.get(gpu0).dyn_cast(); + mlir::dyn_cast(devices_attr.get(gpu0)); ASSERT_NE(device_meta_1, nullptr); ASSERT_EQ(device_meta_1.getCcMajor(), 7); ASSERT_EQ(device_meta_1.getCcMinor(), 0); // If description is empty GPU devices added with an empty metadata. - auto device_meta_2 = devices_attr.get(gpu1).dyn_cast(); + auto device_meta_2 = mlir::dyn_cast(devices_attr.get(gpu1)); ASSERT_NE(device_meta_2, nullptr); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc index 6a66067920fdcb..f0dd8f1c748a25 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/status/status.h" #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/util/managed_stack_trace.h" @@ -33,7 +34,7 @@ StatusScopedDiagnosticHandler::StatusScopedDiagnosticHandler( this->shouldShowLocFn = [](Location loc) -> bool { // For a Location to be surfaced in the stack, it must evaluate to true. // For any Location that is a FileLineColLoc: - if (FileLineColLoc fileLoc = loc.dyn_cast()) { + if (FileLineColLoc fileLoc = mlir::dyn_cast(loc)) { return !tensorflow::IsInternalFrameForFilename( fileLoc.getFilename().str()); } else { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index f01a3f0e09d19b..96ba0afd096a16 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -39,6 +39,7 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -81,22 +82,22 @@ Status ConvertLocation(mlir::Location inst_loc, llvm::StringRef node_name, NodeDef::ExperimentalDebugInfo* debug_info) { mlir::Location unwrapped_inst_loc = GetLocationWithoutOpType(inst_loc); - if (auto call_site = unwrapped_inst_loc.dyn_cast()) { - if (auto name_loc = GetLocationWithoutOpType(call_site.getCallee()) - .dyn_cast()) { + if (auto call_site = mlir::dyn_cast(unwrapped_inst_loc)) { + if (auto name_loc = mlir::dyn_cast( + GetLocationWithoutOpType(call_site.getCallee()))) { llvm::StringRef original_node_name, original_func_name; std::tie(original_node_name, original_func_name) = name_loc.getName().strref().split('@'); // The location points to the current node def. 
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc
index f01a3f0e09d19b..96ba0afd096a16 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "mlir/IR/OperationSupport.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
 #include "mlir/Support/DebugStringHelper.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
@@ -81,22 +82,22 @@ Status ConvertLocation(mlir::Location inst_loc, llvm::StringRef node_name,
                        NodeDef::ExperimentalDebugInfo* debug_info) {
   mlir::Location unwrapped_inst_loc = GetLocationWithoutOpType(inst_loc);

-  if (auto call_site = unwrapped_inst_loc.dyn_cast<mlir::CallSiteLoc>()) {
-    if (auto name_loc = GetLocationWithoutOpType(call_site.getCallee())
-                            .dyn_cast<mlir::NameLoc>()) {
+  if (auto call_site = mlir::dyn_cast<mlir::CallSiteLoc>(unwrapped_inst_loc)) {
+    if (auto name_loc = mlir::dyn_cast<mlir::NameLoc>(
+            GetLocationWithoutOpType(call_site.getCallee()))) {
       llvm::StringRef original_node_name, original_func_name;
       std::tie(original_node_name, original_func_name) =
           name_loc.getName().strref().split('@');
       // The location points to the current node def.
       if (node_name == original_node_name && original_func_name.empty()) {
-        return OkStatus();
+        return absl::OkStatus();
       }
       debug_info->add_original_node_names(original_node_name.str());
       if (!original_func_name.empty()) {
         debug_info->add_original_func_names(original_func_name.str());
       }
     }
-  } else if (auto fused = unwrapped_inst_loc.dyn_cast<mlir::FusedLoc>()) {
+  } else if (auto fused = mlir::dyn_cast<mlir::FusedLoc>(unwrapped_inst_loc)) {
     auto locations = fused.getLocations();
     if (locations.size() <= 1)
       return errors::InvalidArgument("expected experimental debug info.");
@@ -105,22 +106,22 @@ Status ConvertLocation(mlir::Location inst_loc, llvm::StringRef node_name,
       TF_RETURN_IF_ERROR(ConvertLocation(locations[i], node_name, debug_info));
     }
   }
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::BoolAttr& attr, AttrValue* value) {
   value->set_b(attr.getValue());
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::IntegerAttr& attr, AttrValue* value) {
   value->set_i(attr.getInt());
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::FloatAttr& attr, AttrValue* value) {
   value->set_f(attr.getValueAsDouble());
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::ElementsAttr& attr, AttrValue* value) {
@@ -130,27 +131,27 @@ Status ConvertAttribute(const mlir::ElementsAttr& attr, AttrValue* value) {
 Status ConvertAttribute(const mlir::TF::PlaceholderAttr& attr,
                         AttrValue* value) {
   value->set_placeholder(attr.getValue().str());
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::TF::ShapeAttr& attr, AttrValue* value) {
   SetTensorShapeProto(attr, value->mutable_shape());
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::FlatSymbolRefAttr& attr, AttrValue* value) {
   value->mutable_func()->set_name(attr.getValue().str());
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::TF::FuncAttr& attr, bool remove_ref_type,
                         AttrValue* value) {
-  TF_RETURN_IF_ERROR(
-      ConvertAttribute(attr.getName().cast<mlir::FlatSymbolRefAttr>(), value));
+  TF_RETURN_IF_ERROR(ConvertAttribute(
+      mlir::cast<mlir::FlatSymbolRefAttr>(attr.getName()), value));
   TF_RETURN_IF_ERROR(ConvertAttributes(attr.getAttrs().getValue(),
                                        /*attrs_to_ignore=*/{}, remove_ref_type,
                                        value->mutable_func()->mutable_attr()));
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::StringAttr& attr, AttrValue* value) {
@@ -158,22 +159,22 @@ Status ConvertAttribute(const mlir::StringAttr& attr, AttrValue* value) {
   switch (mangling_util::GetMangledKind(attr_value)) {
     case mangling_util::MangledKind::kUnknown: {
       value->set_s(std::string(attr_value));
-      return OkStatus();
+      return absl::OkStatus();
     }
     case mangling_util::MangledKind::kDataType: {
       DataType dtype;
       TF_RETURN_IF_ERROR(mangling_util::DemangleDataType(attr_value, &dtype));
       value->set_type(dtype);
-      return OkStatus();
+      return absl::OkStatus();
     }
     case mangling_util::MangledKind::kTensorShape:
       TF_RETURN_IF_ERROR(
           mangling_util::DemangleShape(attr_value, value->mutable_shape()));
-      return OkStatus();
+      return absl::OkStatus();
     default:
       return errors::Unimplemented("Mangled string couldn't be handled!");
   }
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(mlir::Type type, bool remove_ref_type,
@@ -182,7 +183,7 @@ Status ConvertAttribute(mlir::Type type, bool remove_ref_type,
   TF_RETURN_IF_ERROR(ConvertToDataType(type, &dtype));
   if (tensorflow::IsRefType(dtype)) dtype = tensorflow::RemoveRefType(dtype);
   value->set_type(dtype);
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::TypeAttr& type, bool remove_ref_type,
@@ -192,20 +193,20 @@ Status ConvertAttribute(const mlir::TypeAttr& type, bool remove_ref_type,
 Status ConvertAttribute(const mlir::UnitAttr& attr, AttrValue* value) {
   value->clear_value();
-  return OkStatus();
+  return absl::OkStatus();
 }

 Status ConvertAttribute(const mlir::ArrayAttr& attr, bool remove_ref_type,
                         AttrValue* value) {
   auto* list = value->mutable_list();
   for (mlir::Attribute a : attr.getValue()) {
-    if (auto attr = a.dyn_cast<mlir::BoolAttr>()) {
+    if (auto attr = mlir::dyn_cast<mlir::BoolAttr>(a)) {
       list->add_b(attr.getValue());
-    } else if (auto attr = a.dyn_cast<mlir::IntegerAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::IntegerAttr>(a)) {
       list->add_i(attr.getInt());
-    } else if (auto attr = a.dyn_cast<mlir::FloatAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::FloatAttr>(a)) {
       list->add_f(attr.getValueAsDouble());
-    } else if (auto attr = a.dyn_cast<mlir::StringAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::StringAttr>(a)) {
       AttrValue nested_value;
       TF_RETURN_IF_ERROR(ConvertAttribute(attr, &nested_value));
       switch (nested_value.value_case()) {
@@ -221,32 +222,32 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, bool remove_ref_type,
         default:
           return errors::Unimplemented("Unhandled nested attribute!");
       }
-    } else if (auto attr = a.dyn_cast<mlir::ElementsAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::ElementsAttr>(a)) {
       TensorProto tensor;
       TF_RETURN_IF_ERROR(ConvertToTensorProto(attr, &tensor));
       *list->add_tensor() = tensor;
-    } else if (auto attr = a.dyn_cast<mlir::FlatSymbolRefAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::FlatSymbolRefAttr>(a)) {
       AttrValue attr_val;
       TF_RETURN_IF_ERROR(ConvertAttribute(attr, &attr_val));
       *list->add_func() = attr_val.func();
-    } else if (auto attr = a.dyn_cast<mlir::TypeAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::TypeAttr>(a)) {
       AttrValue attr_val;
       // For type attributes, we only propagate the element type.
       mlir::Type elt_type = attr.getValue();
-      if (auto shaped_type = elt_type.dyn_cast<mlir::ShapedType>()) {
+      if (auto shaped_type = mlir::dyn_cast<mlir::ShapedType>(elt_type)) {
         elt_type = shaped_type.getElementType();
       }
       TF_RETURN_IF_ERROR(
           ConvertAttribute(elt_type, remove_ref_type, &attr_val));
       list->add_type(attr_val.type());
-    } else if (auto attr = a.dyn_cast<mlir::TF::ShapeAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::TF::ShapeAttr>(a)) {
       AttrValue attr_val;
       TF_RETURN_IF_ERROR(ConvertAttribute(attr, &attr_val));
       *list->add_shape() = attr_val.shape();
-    } else if (auto attr = a.dyn_cast<mlir::ArrayAttr>()) {
+    } else if (auto attr = mlir::dyn_cast<mlir::ArrayAttr>(a)) {
       std::vector<int64_t> vals;
       for (mlir::Attribute a : attr.getValue()) {
-        auto i = a.dyn_cast<mlir::IntegerAttr>();
+        auto i = mlir::dyn_cast<mlir::IntegerAttr>(a);
         if (!i)
           return errors::Unimplemented(
               "Expected 64-bit integer array attributes!");
@@ -263,7 +264,7 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, bool remove_ref_type,
       return errors::Unimplemented("Unhandled attribute!");
     }
   }
-  return OkStatus();
+  return absl::OkStatus();
 }

 // Returns true if the executor/control dialect op should map to Ref node in
@@ -274,21 +275,21 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, bool remove_ref_type,
 static bool IsRefTypeControlOp(mlir::Operation* op) {
   if (auto next_iter_sink =
           llvm::dyn_cast<mlir::tf_executor::NextIterationSinkOp>(op))
-    return mlir::getElementTypeOrSelf(next_iter_sink.getInput().getType())
-        .isa<mlir::TF::TensorFlowRefType>();
+    return mlir::isa<mlir::TF::TensorFlowRefType>(
+        mlir::getElementTypeOrSelf(next_iter_sink.getInput().getType()));

   auto op_name_or_status = GetTensorFlowOpName(op->getName().getStringRef());
   if (!op_name_or_status.ok()) return false;

   auto op_name = std::move(op_name_or_status).value();
-  if (op_name.equals("NextIteration"))
-    return mlir::getElementTypeOrSelf(op->getOperand(0).getType())
-        .isa<mlir::TF::TensorFlowRefType>();
-
-  if (op_name.equals("Enter") || op_name.equals("Exit") ||
-      op_name.equals("Switch") || op_name.equals("Merge")) {
-    return getElementTypeOrSelf(op->getResult(0).getType())
-        .isa<mlir::TF::TensorFlowRefType>();
+  if (op_name == "NextIteration")
+    return mlir::isa<mlir::TF::TensorFlowRefType>(
+        mlir::getElementTypeOrSelf(op->getOperand(0).getType()));
+
+  if (op_name == "Enter" || op_name == "Exit" || op_name == "Switch" ||
+      op_name == "Merge") {
+    return mlir::isa<mlir::TF::TensorFlowRefType>(
+        getElementTypeOrSelf(op->getResult(0).getType()));
   }
   return false;
 }
@@ -393,18 +394,18 @@ Status ConvertAttributes(
       name = mangling_util::DemangleAttributeName(name);
     }
     AttrValue value;
-    if (auto symbol_ref = attr.dyn_cast<mlir::SymbolRefAttr>()) {
-      TF_RETURN_IF_ERROR(
-          ConvertAttribute(symbol_ref.cast<mlir::FlatSymbolRefAttr>(), &value));
+    if (auto symbol_ref = mlir::dyn_cast<mlir::SymbolRefAttr>(attr)) {
+      TF_RETURN_IF_ERROR(ConvertAttribute(
+          mlir::cast<mlir::FlatSymbolRefAttr>(symbol_ref), &value));
       func_call_attrs[string(name)] = std::move(value);
       continue;
     }
-    if (auto func_attr = attr.dyn_cast<mlir::TF::FuncAttr>()) {
+    if (auto func_attr = mlir::dyn_cast<mlir::TF::FuncAttr>(attr)) {
       TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, remove_ref_type, &value));
       func_call_attrs[string(name)] = std::move(value);
       continue;
    }
-    if (attr.isa<mlir::AffineMapAttr>()) {
+    if (mlir::isa<mlir::AffineMapAttr>(attr)) {
      // AffineMapAttr is not implemented.
return errors::Unimplemented("AffineMap attribute (needed for '", name_strref, "') unimplemented"); @@ -444,7 +445,7 @@ Status ConvertAttributes( for (auto& it : func_call_attrs) { (*values)[it.first] = std::move(it.second); } - return OkStatus(); + return absl::OkStatus(); } Status SetShapeAttribute(absl::string_view name, mlir::ShapedType shaped_type, @@ -467,7 +468,7 @@ Status SetShapeAttribute(absl::string_view name, mlir::ShapedType shaped_type, actual_shape.ShortDebugString()); } } - return OkStatus(); + return absl::OkStatus(); } bool IsLegacyCallInstruction(mlir::Operation* inst) { @@ -476,7 +477,7 @@ bool IsLegacyCallInstruction(mlir::Operation* inst) { Status AddTensorFlowOpPrefix(std::string prefix) { GlobalOpPrefixes()->insert(prefix); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 86ff64b5ed4d0b..c12c2507e1a03c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -46,11 +46,11 @@ Status AddTensorFlowOpPrefix(std::string); // Maps an MLIR op name in the TensorFlow dialect or the TensorFlow control // dialect back into a TensorFlow valid op name. -StatusOr GetTensorFlowOpName(llvm::StringRef); +absl::StatusOr GetTensorFlowOpName(llvm::StringRef); // Converts an MLIR operation to TensorFlow NodeDef with given node name. This // name should be unique to the graph it is being inserted into. -StatusOr> GetOperationNodeDef( +absl::StatusOr> GetOperationNodeDef( mlir::Operation* inst, llvm::StringRef name); // Converts MLIR attributes with values to their tensorflow equivalent. diff --git a/tensorflow/compiler/mlir/tensorflow/utils/location_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/location_utils.cc index 2a6ff2921a4ad5..afaa78640af3d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/location_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/location_utils.cc @@ -17,15 +17,16 @@ limitations under the License. #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace tensorflow { mlir::Location GetLocationWithoutOpType(mlir::Location loc) { - if (auto fused_loc = loc.dyn_cast()) { + if (auto fused_loc = mlir::dyn_cast(loc)) { auto locations = fused_loc.getLocations(); if (!locations.empty()) { // Skip locations for propagating op_type metadata. - if (auto name_loc = locations[0].dyn_cast()) { + if (auto name_loc = mlir::dyn_cast(locations[0])) { if (name_loc.getName().strref().ends_with(":")) { if (locations.size() == 2) return locations[1]; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc index 2895ebdc9c6424..9e8db314f51b0d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/status/status.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/utils/string_container_utils.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/device.h" @@ -32,7 +33,7 @@ std::string GetVariableName(TF::VarHandleOp var_handle_op) { // In some cases the shared_name attribute doesn't have the same // tensor name in the model, so we first try to use the location // then fallback to shared_name attribute. - if (auto loc = var_handle_op->getLoc().dyn_cast()) + if (auto loc = mlir::dyn_cast(var_handle_op->getLoc())) return loc.getName().str(); return var_handle_op.getSharedName().str(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.cc b/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.cc index 549b665f044314..6ab4aa64a89070 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.cc @@ -17,6 +17,7 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project namespace mlir { @@ -63,7 +64,7 @@ FailureOr GetTfFuncCustomCallFuncName( return failure(); } - if (auto attr = f.dyn_cast()) { + if (auto attr = mlir::dyn_cast(f)) { return attr; } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc index 97f1093fe3d56b..5a29bae67afe01 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc @@ -37,6 +37,7 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-translate/Translation.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo @@ -396,11 +397,11 @@ SerializedMlirStringAttrToMlirModuleTranslate(llvm::StringRef input, // an output parameter is provided for returning the number of chars read. size_t numRead; mlir::Attribute attr = mlir::parseAttribute(input, context, {}, &numRead); - if (!attr || !attr.isa()) { + if (!attr || !mlir::isa(attr)) { LOG(ERROR) << "Input is not parsable as a MLIR StringAttr."; return nullptr; } - auto str_attr = attr.cast(); + auto str_attr = mlir::cast(attr); mlir::DialectRegistry registry; RegisterMlirInputDialects(registry); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index c6ff5f5c93c6ef..d2f10367d0085b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -36,6 +36,7 @@ limitations under the License. 
#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" @@ -92,7 +93,7 @@ absl::Status MismatchedTPUSystemAttributeErr(absl::string_view attribute, T a, // found, the first one lexicographically is returned. If no TPU_SYSTEM device // is found or if there are multiple TPU_SYSTEM devices with different jobs or // replicas, a failure will be returned. -StatusOr> GetTPUSystemDevices( +absl::StatusOr> GetTPUSystemDevices( ParsedDevices devices) { ParsedDevice spec; spec.type = kDeviceTPUSystem; @@ -131,7 +132,7 @@ StatusOr> GetTPUSystemDevices( // Find TPU devices associated to system device based on spec (e.g. from // GetTPUSystemDevices). If the number of TPU devices per host do not match for // every host, a failure will be returned. -StatusOr, 8>> +absl::StatusOr, 8>> GetTPUDevices(ParsedDevices devices, llvm::ArrayRef system_devices) { llvm::SmallVector, 8> tpu_devices; @@ -192,8 +193,8 @@ std::string GetTPUCompilationDevice(ParsedDevice system_device) { // Find the host CPU device for a given TPU device with `DEVICE_CPU` as its // type. If multiple local cpu devices are disabled, always assign id 0. If // set, use the same id as the tpu device. -StatusOr GetCPUHostDeviceForTPUDevice(ParsedDevice tpu_device, - ParsedDevices devices) { +absl::StatusOr GetCPUHostDeviceForTPUDevice( + ParsedDevice tpu_device, ParsedDevices devices) { tpu_device.type = DEVICE_CPU; bool enable_multiple_local_cpu_devices = tensorflow::GetMlirCommonFlags() @@ -214,7 +215,7 @@ StatusOr GetCPUHostDeviceForTPUDevice(ParsedDevice tpu_device, // to every core in the mesh. TPU devices are simply added to // `execution_devices` of one replica. `num_replicas` must be 1 or the total // number of TPU devices available, and `num_cores_per_replica` must be 1. 
-StatusOr<TPUDevicesAndHosts> GetFullMeshTPUExecutionDeviceAssignment(
+absl::StatusOr<TPUDevicesAndHosts> GetFullMeshTPUExecutionDeviceAssignment(
     int num_replicas, int num_cores_per_replica,
     llvm::ArrayRef<llvm::SmallVector<ParsedDevice, 8>> tpu_devices,
     ParsedDevices devices) {
@@ -293,7 +294,7 @@ absl::Status DuplicateCoordinateErrorMsg(absl::string_view attribute, int x,
 // - device coordinates within the mesh shape
 // - no duplicate device coordinates
 // - number of device coordinates (in tuple 3) match number of available TPUs
-StatusOr<xla::Array4D<TaskAndDevice>> ParseTopologyAttr(
+absl::StatusOr<xla::Array4D<TaskAndDevice>> ParseTopologyAttr(
     llvm::StringRef topology_attr, int num_tasks, int num_tpus_per_task) {
   tpu::TopologyProto topology_proto;
   if (!topology_proto.ParseFromString(topology_attr.str()))
@@ -375,7 +376,7 @@ StatusOr<xla::Array4D<TaskAndDevice>> ParseTopologyAttr(
 // - number of device coordinates (in tuple 3) match number 'num_replicas' *
 //   'num_cores_per_replica'
 // - a TPU device associated with each device coordinate
-StatusOr<std::pair<TPUDevicesAndHosts, xla::DeviceAssignmentProto>>
+absl::StatusOr<std::pair<TPUDevicesAndHosts, xla::DeviceAssignmentProto>>
 GetGeneralTPUExecutionDeviceAssignment(
     int num_replicas, int num_cores_per_replica,
     llvm::ArrayRef<llvm::SmallVector<ParsedDevice, 8>> tpu_devices,
@@ -480,7 +481,7 @@ mlir::LogicalResult GetDeviceAssignmentCoordinates(
     return cluster.emitOpError(llvm::formatv("requires attribute '{0}'",
                                              tensorflow::kDeviceAssignmentAttr)
                                    .str());
-  if (StatusOr<llvm::SmallVector<int64_t, 8>> fetched_device_coordinates =
+  if (absl::StatusOr<llvm::SmallVector<int64_t, 8>> fetched_device_coordinates =
           tensorflow::GetDeviceCoordinates(device_assignment_attr);
       fetched_device_coordinates.ok()) {
     device_coordinates = *fetched_device_coordinates;
@@ -516,7 +517,7 @@ mlir::LogicalResult GetTPUDevicesAndHostsNotReplicated(
   }

   // Determine compilation and execution devices.
-  if (StatusOr<TPUDeviceAssignment> tpu_device_assignment =
+  if (absl::StatusOr<TPUDeviceAssignment> tpu_device_assignment =
           tensorflow::GetTPUCompilationAndExecutionDevices(
               devices.device_names(), /*num_replicas=*/1,
               GetNumCoresPerReplica(cluster), topology, device_coordinates);
@@ -601,7 +602,7 @@ mlir::LogicalResult GetTPUToHostMap(

 }  // anonymous namespace

-StatusOr<llvm::SmallVector<int64_t, 8>> GetDeviceCoordinates(
+absl::StatusOr<llvm::SmallVector<int64_t, 8>> GetDeviceCoordinates(
     mlir::ArrayAttr device_assignment_attr) {
   llvm::SmallVector<int64_t, 8> device_coordinates;
   device_coordinates.reserve(device_assignment_attr.size());
@@ -609,7 +610,7 @@ StatusOr<llvm::SmallVector<int64_t, 8>> GetDeviceCoordinates(
   for (auto device_coordinate_and_idx :
        llvm::enumerate(device_assignment_attr)) {
     auto device_coordinate =
-        device_coordinate_and_idx.value().dyn_cast<mlir::IntegerAttr>();
+        mlir::dyn_cast<mlir::IntegerAttr>(device_coordinate_and_idx.value());
     if (!device_coordinate)
       return absl::InvalidArgumentError(
           llvm::formatv(kBadIntArrayElementMsg, kDeviceAssignmentAttr,
@@ -622,7 +623,7 @@ StatusOr<llvm::SmallVector<int64_t, 8>> GetDeviceCoordinates(
   return device_coordinates;
 }

-StatusOr<TPUDeviceAssignment> GetTPUCompilationAndExecutionDevices(
+absl::StatusOr<TPUDeviceAssignment> GetTPUCompilationAndExecutionDevices(
     ParsedDevices devices, int num_replicas, int num_cores_per_replica,
     llvm::StringRef topology_attr,
     llvm::ArrayRef<int64_t> device_assignment_attr) {
@@ -733,8 +734,8 @@ bool IsTPUReplicatedCore(llvm::StringRef device) {
 bool TypeValidForXLA(const mlir::Type& type) {
   const mlir::Type elem = getElementTypeOrSelf(type);
-  return !elem.isa<mlir::TF::ResourceType>() &&
-         !elem.isa<mlir::TF::StringType>();
+  return !mlir::isa<mlir::TF::ResourceType>(elem) &&
+         !mlir::isa<mlir::TF::StringType>(elem);
 }

 mlir::LogicalResult GetDeviceToHostMap(
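A hedged sketch of consuming the `absl::StatusOr` results above, mirroring the `GetDeviceAssignmentCoordinates` pattern (a C++17 if-initializer keeps the `StatusOr` scoped to the check); the helper name `ReadCoordinates` and its error text are hypothetical:

```c++
#include "absl/status/statusor.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h"

mlir::LogicalResult ReadCoordinates(mlir::Operation* op,
                                    mlir::ArrayAttr device_assignment_attr,
                                    llvm::SmallVector<int64_t, 8>& out) {
  if (absl::StatusOr<llvm::SmallVector<int64_t, 8>> coords =
          tensorflow::GetDeviceCoordinates(device_assignment_attr);
      coords.ok()) {
    out = *coords;  // Move the parsed coordinates out of the StatusOr.
    return mlir::success();
  }
  return op->emitError("bad device assignment attribute");
}
```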
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h
index 9ed5d7614aaf4c..f7c9b29d6cfdcc 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h
+++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h
@@ -80,7 +80,7 @@ struct TPUDeviceAssignment {
 };

 // Extracts device coordinates from a device assignment attribute on an op.
-StatusOr<llvm::SmallVector<int64_t, 8>> GetDeviceCoordinates(
+absl::StatusOr<llvm::SmallVector<int64_t, 8>> GetDeviceCoordinates(
     mlir::ArrayAttr device_assignment_attr);

 // Finds the TPU compilation device and execution devices from `devices` for a
@@ -234,7 +234,7 @@ StatusOr<llvm::SmallVector<int64_t, 8>> GetDeviceCoordinates(
 //     replica_device_ids: 7
 //   }
 // }
-StatusOr<TPUDeviceAssignment> GetTPUCompilationAndExecutionDevices(
+absl::StatusOr<TPUDeviceAssignment> GetTPUCompilationAndExecutionDevices(
     llvm::ArrayRef<DeviceNameUtils::ParsedName> devices, int num_replicas,
     int num_cores_per_replica, llvm::StringRef topology_attr,
     llvm::ArrayRef<int64_t> device_assignment_attr);
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
index 2c749b549cdc86..c6d80802b2aa0a 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
@@ -38,7 +38,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {

-tsl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> GetMlirModuleFromString(
+absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> GetMlirModuleFromString(
     llvm::StringRef string, mlir::MLIRContext* context) {
   mlir::DialectRegistry mlir_registry;
   RegisterAllTensorFlowDialects(mlir_registry);
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc
index 988950389edf8b..e23e2313711f9c 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h"

 #include "mlir/IR/OpDefinition.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/core/platform/errors.h"

 namespace tensorflow {
@@ -44,21 +45,21 @@ mlir::LogicalResult ExtractTfVersions(mlir::ModuleOp module,
   if (!version_attr) return mlir::failure();

   auto producer =
-      version_attr.get("producer").dyn_cast_or_null<mlir::IntegerAttr>();
+      mlir::dyn_cast_or_null<mlir::IntegerAttr>(version_attr.get("producer"));
   if (!producer) return mlir::failure();
   versions->set_producer(producer.getInt());

-  auto min_consumer =
-      version_attr.get("min_consumer").dyn_cast_or_null<mlir::IntegerAttr>();
+  auto min_consumer = mlir::dyn_cast_or_null<mlir::IntegerAttr>(
+      version_attr.get("min_consumer"));
   if (min_consumer) versions->set_min_consumer(min_consumer.getInt());

-  auto bad_consumers =
-      version_attr.get("bad_consumers").dyn_cast_or_null<mlir::ArrayAttr>();
+  auto bad_consumers = mlir::dyn_cast_or_null<mlir::ArrayAttr>(
+      version_attr.get("bad_consumers"));
   if (!bad_consumers) return mlir::success();

   for (auto bad_consumer : bad_consumers) {
     auto bad_consumer_int_attr =
-        bad_consumer.dyn_cast_or_null<mlir::IntegerAttr>();
+        mlir::dyn_cast_or_null<mlir::IntegerAttr>(bad_consumer);
     if (!bad_consumer_int_attr) return mlir::failure();

     versions->mutable_bad_consumers()->Add(bad_consumer_int_attr.getInt());
@@ -66,13 +67,13 @@ mlir::LogicalResult ExtractTfVersions(mlir::ModuleOp module,
   return mlir::success();
 }

-::tsl::StatusOr<int64_t> GetTfGraphProducerVersion(mlir::ModuleOp module) {
+absl::StatusOr<int64_t> GetTfGraphProducerVersion(mlir::ModuleOp module) {
   auto versions = module->getAttrOfType<::mlir::DictionaryAttr>("tf.versions");
   if (!versions) {
     return errors::Internal(
         "Missing 'tf.versions' attribute on the module, abort.\n");
   }
-  auto producer = versions.get("producer").dyn_cast<mlir::IntegerAttr>();
+  auto producer = mlir::dyn_cast<mlir::IntegerAttr>(versions.get("producer"));
   if (!producer) {
     return errors::Internal(
         "Missing 'producer' attribute on the module, abort.\n");
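The translate_utils change keeps the same contract: the producer version is read from the module-level `tf.versions` dictionary attribute, e.g. `module attributes {tf.versions = {producer = 268 : i32, ...}}`. A hedged caller sketch, assuming the `absl::StatusOr<int64_t>` return type reconstructed above (`CheckProducer` is hypothetical):

```c++
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "mlir/IR/BuiltinOps.h"
#include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h"
#include "tensorflow/core/platform/logging.h"

absl::Status CheckProducer(mlir::ModuleOp module) {
  absl::StatusOr<int64_t> producer =
      tensorflow::GetTfGraphProducerVersion(module);
  if (!producer.ok()) return producer.status();  // No/invalid tf.versions.
  LOG(INFO) << "GraphDef producer version: " << *producer;
  return absl::OkStatus();
}
```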
a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h
index feccb1754d5781..f9acbb9a88e7cb 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h
+++ b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h
@@ -37,7 +37,7 @@ mlir::LogicalResult ExtractTfVersions(mlir::ModuleOp module,

 // Returns TensorFlow GraphDef producer version for the given module. Returns an
 // error if the version information is missing for the module or is not valid.
-::tsl::StatusOr<int64_t> GetTfGraphProducerVersion(mlir::ModuleOp module);
+absl::StatusOr<int64_t> GetTfGraphProducerVersion(mlir::ModuleOp module);

 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util_test.cc
index eb4c9ee85274ea..c0046f83664223 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util_test.cc
@@ -36,7 +36,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {

-tsl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> GetMlirModuleFromString(
+absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> GetMlirModuleFromString(
     llvm::StringRef string, mlir::MLIRContext* context) {
   mlir::DialectRegistry mlir_registry;
   RegisterAllTensorFlowDialects(mlir_registry);
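The xla_sharding_util hunks that follow all funnel through `DecodeShardingAttribute`, which rejects non-string attributes and then parses the serialized `xla::OpSharding` proto out of the string payload. A hedged wrapper sketch (the name `GetSharding` is hypothetical; the underlying call matches the signature visible in the diff below):

```c++
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
#include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h"
#include "xla/xla_data.pb.h"

mlir::LogicalResult GetSharding(mlir::Attribute shard_attr,
                                xla::OpSharding& sharding) {
  // Non-string attributes are rejected up front, exactly as in the patch.
  if (!mlir::isa<mlir::StringAttr>(shard_attr)) return mlir::failure();
  return tensorflow::DecodeShardingAttribute(shard_attr, sharding,
                                             /*report_error=*/false);
}
```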
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
index ea76adb284b7e2..334cca591cf569 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
@@ -70,7 +70,7 @@ mlir::LogicalResult CreateSplitOp(const int num_split,
   // Correctly set output shapes of split op output if input shape is statically
   // known.
   mlir::Type output_type;
-  auto input_type = src_input.getType().cast<mlir::TensorType>();
+  auto input_type = mlir::cast<mlir::TensorType>(src_input.getType());

   if (input_type.hasRank()) {
     if (input_type.getShape()[split_dimension] == mlir::ShapedType::kDynamic) {
@@ -122,7 +122,7 @@ mlir::TF::ConcatOp CreateConcatOp(const int concat_dimension,
   // across logical devices, we refer to the shape of 0th logical device
   // computation output.
   mlir::Type output_type;
-  auto input_type = inputs[0].getType().cast<mlir::TensorType>();
+  auto input_type = mlir::cast<mlir::TensorType>(inputs[0].getType());

   if (input_type.hasRank()) {
     if (input_type.getShape()[concat_dimension] == mlir::ShapedType::kDynamic) {
@@ -294,9 +294,9 @@ mlir::LogicalResult DecodeShardingAttribute(const std::string& shard_str,
 mlir::LogicalResult DecodeShardingAttribute(mlir::Attribute shard_attr,
                                             xla::OpSharding& sharding,
                                             bool report_error) {
-  if (!shard_attr.isa<mlir::StringAttr>()) return mlir::failure();
+  if (!mlir::isa<mlir::StringAttr>(shard_attr)) return mlir::failure();

-  auto shard_str = shard_attr.cast<mlir::StringAttr>().getValue().str();
+  auto shard_str = mlir::cast<mlir::StringAttr>(shard_attr).getValue().str();
   return DecodeShardingAttribute(shard_str, sharding, report_error);
 }

@@ -350,7 +350,8 @@ mlir::LogicalResult ExtractInputsForLogicalDevices(
     xla::OpSharding sharding;
     if (DecodeShardingAttribute(
-            sharding_attr.cast<mlir::StringAttr>().getValue().str(), sharding)
+            mlir::cast<mlir::StringAttr>(sharding_attr).getValue().str(),
+            sharding)
             .failed()) {
       return cluster_func.emitError("incorrect sharding format for inputs");
     }
@@ -443,13 +444,14 @@ mlir::LogicalResult ParseAndValidateOutputSharding(
        llvm::enumerate(output_sharding_attrs)) {
     const auto& output_sharding = output_sharding_and_index.value();
     const int sharding_index = output_sharding_and_index.index();
-    if (!output_sharding.isa<mlir::StringAttr>())
+    if (!mlir::isa<mlir::StringAttr>(output_sharding))
       return cluster_func.emitError(llvm::formatv(
           "non-string output sharding at index {0}", sharding_index));

     xla::OpSharding sharding;
     if (DecodeShardingAttribute(
-            output_sharding.cast<mlir::StringAttr>().getValue().str(), sharding)
+            mlir::cast<mlir::StringAttr>(output_sharding).getValue().str(),
+            sharding)
             .failed()) {
       return cluster_func.emitError("incorrect sharding format for outputs");
     }
@@ -661,7 +663,7 @@ mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation(
     const auto output_index = result_and_index.index();
     const auto& output_sharding = output_sharding_config[output_index];
     const auto cluster_func_output_type =
-        result_and_index.value().getType().cast<mlir::TensorType>();
+        mlir::cast<mlir::TensorType>(result_and_index.value().getType());

     // If output shape of cluster func is statically known and output is tiled
     // sharded, then the corresponding output shape of cluster func must be
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
index 322862828e63b3..7358b97971e0fe 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
@@ -175,13 +175,13 @@ Status GetXlaInputShapes(
 // bounded type by using the bounds as dimension sizes. Returns null if it is
 // neither.
 mlir::RankedTensorType GetBufferType(mlir::Type ty) {
-  auto ranked_ty = ty.dyn_cast_or_null<mlir::RankedTensorType>();
+  auto ranked_ty = mlir::dyn_cast_or_null<mlir::RankedTensorType>(ty);
   if (!ranked_ty) return {};

   int64_t rank = ranked_ty.getRank();
   llvm::SmallVector<int64_t, 4> dims = llvm::to_vector<4>(ranked_ty.getShape());
-  auto encoding = ranked_ty.getEncoding()
-                      .dyn_cast_or_null<mlir::mhlo::TypeExtensionsAttr>();
+  auto encoding = mlir::dyn_cast_or_null<mlir::mhlo::TypeExtensionsAttr>(
+      ranked_ty.getEncoding());
   if (encoding && !encoding.getBounds().empty()) {
     for (int64_t dim = 0; dim < rank; ++dim) {
       if (dims[dim] == mlir::ShapedType::kDynamic) {
@@ -234,7 +234,7 @@ Status GetOutputInfo(
   auto return_op = main_func.begin()->getTerminator();
   for (const auto& type_and_idx : llvm::enumerate(func_type.getResults())) {
     size_t idx = type_and_idx.index();
-    auto result_ty = type_and_idx.value().cast<mlir::RankedTensorType>();
+    auto result_ty = mlir::cast<mlir::RankedTensorType>(type_and_idx.value());

     // If the result type isn't static, then the owner of the result may be a
     // cast op from a more specific bounded type to an unbounded dynamic type.
@@ -275,7 +275,8 @@ Status GetOutputInfo(
     TF_RETURN_IF_ERROR(MaybeRewriteLayoutWithShardedShape(
         sharding, shape_determination_fns, &shape));

-    auto tensor_type = type_and_idx.value().dyn_cast<mlir::RankedTensorType>();
+    auto tensor_type =
+        mlir::dyn_cast<mlir::RankedTensorType>(type_and_idx.value());
     shapes.push_back(shape);

     auto it = output_to_input_alias.find(type_and_idx.index());
@@ -872,7 +873,7 @@ static absl::StatusOr<std::vector<bool>> RewriteWithArgs(
       auto resource_type =
           mlir::TF::ResourceType::get({resource_subtype}, builder.getContext());

-      auto tensor_type = mlir_arg.getType().cast<mlir::TensorType>();
+      auto tensor_type = mlir::cast<mlir::TensorType>(mlir_arg.getType());
       if (tensor_type.hasRank()) {
         mlir_arg.setType(
             GetTypeFromTFTensorShape(tensor_type.getShape(), resource_type));
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index 545203ad20ea23..709a63bea84ebe 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -210,6 +210,7 @@ tf_cc_test(
     srcs = ["tf_dialect_to_executor_test.cc"],
     data = [
         "testdata/empty_func.mlir",
+        "testdata/func_with_dead_ops.mlir",
         "testdata/invalid_executor.mlir",
     ],
     deps = [
@@ -220,10 +221,9 @@ tf_cc_test(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
        "@com_google_googletest//:gtest_main",
+        "@llvm-project//llvm:Support",
        "@llvm-project//mlir:IR",
        "@llvm-project//mlir:Parser",
        "@local_tsl//tsl/lib/core:status_test_util",
-        "@local_tsl//tsl/lib/monitoring:test_utils",
-        "@local_tsl//tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc
index 0e7e61999d8f2b..0f5680bda420d2 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc
@@ -46,6 +46,7 @@ namespace tf2xla {
 namespace v2 {

 using ::tensorflow::monitoring::testing::CellReader;
+using ::testing::Not;
 using ::testing::TestWithParam;
 using tpu::FunctionToHloArgs;
 using tpu::MlirToHloArgs;
@@ -334,6 +335,41 @@ TEST(LegalizeTFTest, SuccessfullyCompilesModulesWithReturnValues) {
               ComputationProtoContains("opcode:.*constant"));
 }

+TEST(LegalizeTFTest, SkipsTensorListSetItemIfDimensionsTooLarge) {
+  static constexpr char kTensorListSetItemDimensionTooLarge[] = R"(
+  module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} {
+    func.func @main() -> tensor>> {
+      // unknown rank
+      %elem_shape = "tf.Const"() <{value = dense<-1> : tensor}> {device =
"/job:localhost/replica:0/task:0/device:CPU:0"} : () -> tensor + // zero reserved elements + %num_elements = "tf.Const"() <{value = dense<0> : tensor}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> tensor + + %list = "tf.TensorListReserve"(%elem_shape, %num_elements) : (tensor, tensor) -> tensor>> + + %index = "tf.Const"() <{value = dense<0> : tensor}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> tensor + %element = "tf.Const"() <{value = dense<0.0> : tensor<64x1xbf16>}> {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> tensor<64x1xbf16> + // Results in a bad mismatch of shapes. + %updated_list = "tf.TensorListSetItem"(%list, %index, %element) : (tensor>>, tensor, tensor<64x1xbf16>) -> tensor>> + + return %updated_list : tensor>> + } + })"; + + auto compilation_result = CompileMlirModule( + kTensorListSetItemDimensionTooLarge, + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED); + + // Ensure that it compile + ASSERT_TRUE(compilation_result.ok()); + // Assert that the tensor list operation is lowered to something. + ASSERT_THAT(compilation_result, + Not(ComputationProtoContains("%.*= \"tf.TensorListSetItem"))); + // Assert that the tensor list operation is lowered to something that doesn't + // get stuck on a broken dynamic update slice. + ASSERT_THAT(compilation_result, + Not(ComputationProtoContains("%.*=.*DynamicUpdateSlice"))); +} + } // namespace v2 } // namespace tf2xla } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/func_with_dead_ops.mlir b/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/func_with_dead_ops.mlir new file mode 100644 index 00000000000000..f8dd51f4e12d3c --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/testdata/func_with_dead_ops.mlir @@ -0,0 +1,62 @@ +module attributes {tf.devices = {"/job:tpu_host_worker/replica:0/task:0/device:CPU:0", "/job:tpu_host_worker/replica:0/task:0/device:TPU:0", "/job:tpu_host_worker/replica:0/task:0/device:TPU:1", "/job:tpu_host_worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:tpu_host_worker/replica:0/task:1/device:CPU:0", "/job:tpu_host_worker/replica:0/task:1/device:TPU:0", "/job:tpu_host_worker/replica:0/task:1/device:TPU:1", "/job:tpu_host_worker/replica:0/task:1/device:TPU_SYSTEM:0", "/job:tpu_host_worker/replica:0/task:2/device:CPU:0", "/job:tpu_host_worker/replica:0/task:2/device:TPU:0", "/job:tpu_host_worker/replica:0/task:2/device:TPU:1", "/job:tpu_host_worker/replica:0/task:2/device:TPU_SYSTEM:0", "/job:tpu_host_worker/replica:0/task:3/device:CPU:0", "/job:tpu_host_worker/replica:0/task:3/device:TPU:0", "/job:tpu_host_worker/replica:0/task:3/device:TPU:1", "/job:tpu_host_worker/replica:0/task:3/device:TPU_SYSTEM:0"}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1847 : i32}} { + func.func @main(%arg0: tensor {tf._user_specified_name = "steps", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg1: tensor<*x!tf_type.resource>> {tf._user_specified_name = "899", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg2: tensor<*x!tf_type.resource>> {tf._user_specified_name = "901", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg3: tensor<*x!tf_type.resource>> {tf._user_specified_name = "903", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg4: tensor<*x!tf_type.resource>> {tf._user_specified_name = "905", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg5: tensor<*x!tf_type.resource>> 
{tf._user_specified_name = "907", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg6: tensor<*x!tf_type.resource>> {tf._user_specified_name = "909", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg7: tensor<*x!tf_type.resource>> {tf._user_specified_name = "911", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg8: tensor<*x!tf_type.resource>> {tf._user_specified_name = "913", tf.device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0"}, %arg9: tensor<*x!tf_type.resource>> {tf._user_specified_name = "915", tf.device = "/job:tpu_host_worker/replica:0/task:2/device:CPU:0"}, %arg10: tensor<*x!tf_type.resource>> {tf._user_specified_name = "917", tf.device = "/job:tpu_host_worker/replica:0/task:3/device:CPU:0"}, %arg11: tensor<*x!tf_type.resource>> {tf._user_specified_name = "919", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg12: tensor<*x!tf_type.resource>> {tf._user_specified_name = "921", tf.device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0"}, %arg13: tensor<*x!tf_type.resource>> {tf._user_specified_name = "923", tf.device = "/job:tpu_host_worker/replica:0/task:2/device:CPU:0"}, %arg14: tensor<*x!tf_type.resource>> {tf._user_specified_name = "925", tf.device = "/job:tpu_host_worker/replica:0/task:3/device:CPU:0"}, %arg15: tensor<*x!tf_type.resource>> {tf._user_specified_name = "927", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg16: tensor<*x!tf_type.resource>> {tf._user_specified_name = "929", tf.device = "/job:tpu_host_worker/replica:0/task:1/device:CPU:0"}, %arg17: tensor<*x!tf_type.resource>> {tf._user_specified_name = "931", tf.device = "/job:tpu_host_worker/replica:0/task:2/device:CPU:0"}, %arg18: tensor<*x!tf_type.resource>> {tf._user_specified_name = "933", tf.device = "/job:tpu_host_worker/replica:0/task:3/device:CPU:0"}, %arg19: tensor<*x!tf_type.resource>> {tf._user_specified_name = "935", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg20: tensor<*x!tf_type.resource>> {tf._user_specified_name = "937", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg21: tensor<*x!tf_type.resource>> {tf._user_specified_name = "939", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}) -> tensor attributes {allow_soft_placement = false, tf.entry_function = {control_outputs = "", inputs = "steps,unknown,unknown_0,unknown_1,unknown_2,unknown_3,unknown_4,unknown_5,unknown_6,unknown_7,unknown_8,unknown_9,unknown_10,unknown_11,unknown_12,unknown_13,unknown_14,unknown_15,unknown_16,unknown_17,unknown_18,unknown_19", outputs = "statefulpartitionedcall_RetVal"}} { + %0 = "tf.ReadVariableOp"(%arg19) : (tensor<*x!tf_type.resource>>) -> tensor<128x1024xf32> + %1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource>>) -> tensor + %2 = "tf.ReadVariableOp"(%arg2) : (tensor<*x!tf_type.resource>>) -> tensor + %3 = "tf.ReadVariableOp"(%arg4) : (tensor<*x!tf_type.resource>>) -> tensor<1024xf32> + %4 = "tf.ReadVariableOp"(%arg3) : (tensor<*x!tf_type.resource>>) -> tensor<128x1024xf32> + %5 = "tf.ReadVariableOp"(%arg5) : (tensor<*x!tf_type.resource>>) -> tensor<1024x1xf32> + %6 = "tf.ReadVariableOp"(%arg20) : (tensor<*x!tf_type.resource>>) -> tensor<1024xf32> + %7 = "tf.ReadVariableOp"(%arg21) : (tensor<*x!tf_type.resource>>) -> tensor<1024x1xf32> + %8 = "tf.ReadVariableOp"(%arg6) : (tensor<*x!tf_type.resource>>) -> tensor + %9 = "tf.Const"() <{value = dense<"test"> : tensor<3x!tf_type.string>}> : () -> 
tensor<3x!tf_type.string> + %cst = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + %11:4 = "tf.Split"(%cst, %0) {num_split = 4 : i32} : (tensor, tensor<128x1024xf32>) -> (tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xf32>) + %cst_0 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + %12:4 = "tf.Split"(%cst_0, %4) {num_split = 4 : i32} : (tensor, tensor<128x1024xf32>) -> (tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xf32>) + %cst_1 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + %cst_2 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + %13:20 = tf_device.replicate {devices = {TPU_REPLICATED_CORE_0 = ["/job:tpu_host_worker/replica:0/task:0/device:TPU:0", "/job:tpu_host_worker/replica:0/task:2/device:TPU:0"], TPU_REPLICATED_CORE_1 = ["/job:tpu_host_worker/replica:0/task:0/device:TPU:1", "/job:tpu_host_worker/replica:0/task:2/device:TPU:1"], TPU_REPLICATED_CORE_2 = ["/job:tpu_host_worker/replica:0/task:1/device:TPU:0", "/job:tpu_host_worker/replica:0/task:3/device:TPU:0"], TPU_REPLICATED_CORE_3 = ["/job:tpu_host_worker/replica:0/task:1/device:TPU:1", "/job:tpu_host_worker/replica:0/task:3/device:TPU:1"], TPU_REPLICATED_HOST_0 = ["/job:tpu_host_worker/replica:0/task:0/device:CPU:0", "/job:tpu_host_worker/replica:0/task:2/device:CPU:0"], TPU_REPLICATED_HOST_1 = ["/job:tpu_host_worker/replica:0/task:0/device:CPU:0", "/job:tpu_host_worker/replica:0/task:2/device:CPU:0"], TPU_REPLICATED_HOST_2 = ["/job:tpu_host_worker/replica:0/task:1/device:CPU:0", "/job:tpu_host_worker/replica:0/task:3/device:CPU:0"], TPU_REPLICATED_HOST_3 = ["/job:tpu_host_worker/replica:0/task:1/device:CPU:0", "/job:tpu_host_worker/replica:0/task:3/device:CPU:0"]}, n = 2 : i32} { + %16:40 = "tf_device.parallel_execute"() ({ + %19:10 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}> ({ + %20:10 = "tf.TPUExecute"(%arg0, %11#0, %1, %2, %3, %12#0, %5, %6, %7, %8, %9) : (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor, tensor<3x!tf_type.string>) -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %20#0, %20#1, %20#2, %20#3, %20#4, %20#5, %20#6, %20#7, %20#8, %20#9 : tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }) : () -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %19#0, %19#1, %19#2, %19#3, %19#4, %19#5, %19#6, %19#7, %19#8, %19#9 : tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }, { + %19:10 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}> ({ + %20:10 = "tf.TPUExecute"(%arg0, %11#1, %1, %2, %3, %12#1, %5, %6, %7, %8, %9) : (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor, tensor<3x!tf_type.string>) -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %20#0, %20#1, %20#2, %20#3, %20#4, %20#5, %20#6, %20#7, %20#8, %20#9 : tensor, 
tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }) : () -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %19#0, %19#1, %19#2, %19#3, %19#4, %19#5, %19#6, %19#7, %19#8, %19#9 : tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }, { + %19:10 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_2"}> ({ + %20:10 = "tf.TPUExecute"(%arg0, %11#2, %1, %2, %3, %12#2, %5, %6, %7, %8, %9) : (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor, tensor<3x!tf_type.string>) -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %20#0, %20#1, %20#2, %20#3, %20#4, %20#5, %20#6, %20#7, %20#8, %20#9 : tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }) : () -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %19#0, %19#1, %19#2, %19#3, %19#4, %19#5, %19#6, %19#7, %19#8, %19#9 : tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }, { + %19:10 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_3"}> ({ + %20:10 = "tf.TPUExecute"(%arg0, %11#3, %1, %2, %3, %12#3, %5, %6, %7, %8, %9) : (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor, tensor<3x!tf_type.string>) -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %20#0, %20#1, %20#2, %20#3, %20#4, %20#5, %20#6, %20#7, %20#8, %20#9 : tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }) : () -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + tf_device.return %19#0, %19#1, %19#2, %19#3, %19#4, %19#5, %19#6, %19#7, %19#8, %19#9 : tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor + }) : () -> (tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor, tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor, tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor, tensor, tensor<32x1024xf32>, tensor, tensor, tensor<1024xf32>, tensor<32x1024xf32>, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor) + %17 = "tf.Concat"(%cst_1, %16#5, %16#15, %16#25, %16#35) : (tensor, tensor<32x1024xf32>, tensor<32x1024xf32>, 
tensor<32x1024xf32>, tensor<32x1024xf32>) -> tensor<128x1024xf32> + %18 = "tf.Concat"(%cst_2, %16#1, %16#11, %16#21, %16#31) : (tensor, tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xf32>) -> tensor<128x1024xf32> + tf_device.return %16#0, %16#9, %16#8, %16#7, %16#6, %17, %16#4, %16#3, %16#2, %18 : tensor, tensor, tensor<1024x1xf32>, tensor<1024xf32>, tensor<1024x1xf32>, tensor<128x1024xf32>, tensor<1024xf32>, tensor, tensor, tensor<128x1024xf32> + } + "tf.AssignVariableOp"(%arg19, %13#18) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<128x1024xf32>) -> () + "tf.AssignVariableOp"(%arg1, %13#16) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor) -> () + "tf.AssignVariableOp"(%arg2, %13#14) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor) -> () + "tf.AssignVariableOp"(%arg4, %13#12) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<1024xf32>) -> () + "tf.AssignVariableOp"(%arg3, %13#10) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<128x1024xf32>) -> () + "tf.AssignVariableOp"(%arg5, %13#8) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<1024x1xf32>) -> () + "tf.AssignVariableOp"(%arg20, %13#6) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<1024xf32>) -> () + "tf.AssignVariableOp"(%arg21, %13#4) <{validate_shape = false}> : (tensor<*x!tf_type.resource>>, tensor<1024x1xf32>) -> () + "tf.AssignVariableOp"(%arg6, %13#2) <{validate_shape = true}> {_has_manual_control_dependencies = true} : (tensor<*x!tf_type.resource>>, tensor) -> () + %14 = "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor<*x!tf_type.resource>>) -> tensor + %15 = "tf.Identity"(%14) {device = ""} : (tensor) -> tensor + return %15 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc index c92fd85d3567b4..cd13e869e811dd 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc @@ -88,6 +88,8 @@ void AddTfDialectToExecutorPasses(OpPassManager &pm) { pm.addNestedPass(mlir::TFTPU::CreateTPUDevicePropagationPass()); pm.addNestedPass(mlir::TFTPU::CreateTPUColocateSplitsPass()); pm.addPass(mlir::createSymbolDCEPass()); + pm.addNestedPass( + mlir::tf_executor::CreateTFExecutorGraphPruningPass()); if (tensorflow::GetMlirCommonFlags() ->tf_mlir_enable_convert_control_to_data_outputs_pass) { bool composite_tpuexecute_side_effects = diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc index 0c64dd3dcbe1a3..897c800d9e4cd7 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor_test.cc @@ -15,12 +15,16 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h" +#include + #include #include #include #include "absl/status/status.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/DialectRegistry.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -30,7 +34,6 @@ limitations under the License. 
#include "tensorflow/core/lib/monitoring/cell_reader.h" #include "tensorflow/core/platform/resource_loader.h" #include "tsl/lib/core/status_test_util.h" -#include "tsl/platform/status.h" namespace tensorflow { namespace tf2xla { @@ -53,6 +56,16 @@ std::string TestDataPath() { "tensorflow/compiler/mlir/tf2xla/api/v2/testdata/"); } +size_t CountSubstring(absl::string_view str, absl::string_view substr) { + size_t count = 0; + size_t idx = str.find(substr); + while (idx != std::string::npos) { + count++; + idx = str.find(substr, idx + 1); + } + return count; +} + class TensorflowDialectToExecutorTest : public ::testing::Test { public: TensorflowDialectToExecutorTest() { @@ -100,6 +113,23 @@ TEST_F(TensorflowDialectToExecutorTest, ErrorsWhenCannotConvert) { EXPECT_EQ(compilation_status.Delta(kExportFailed), 1); } +TEST_F(TensorflowDialectToExecutorTest, PrunesDeadOps) { + CellReader compilation_status(kExportStreamzName); + + TF_ASSERT_OK(CreateMlirModule("func_with_dead_ops.mlir")); + + TF_EXPECT_OK(ExportFromTensorflowDialectToExecutor(*mlir_module_)); + + std::string module_dump; + llvm::raw_string_ostream raw_stream(module_dump); + mlir_module_->print(raw_stream); + + EXPECT_EQ(compilation_status.Delta(kExportSuccess), 1); + EXPECT_EQ(compilation_status.Delta(kExportFailed), 0); + EXPECT_EQ( + CountSubstring(module_dump, "tf_executor.island wraps \"tf.Concat\""), 2); +} + } // namespace } // namespace v2 } // namespace tf2xla diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc index d0909c452a0325..d702cf308f8f80 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc @@ -35,8 +35,6 @@ namespace internal { using mlir::OpPassManager; using mlir::func::FuncOp; -// LINT.IfChange(replicated_bridge_passes) - // Adds replicated Bridge clustering pipeline passes to the given pass_manager. // Does not run them. void AddReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm, @@ -151,7 +149,8 @@ void AddReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm, pm.addPass(mlir::TFDevice::CreateClusterOutliningPass()); pm.addPass(mlir::TFTPU::CreateTPUResourceReadForWritePass()); pm.addPass(mlir::TFDevice::CreateMarkInputOutputAliasesPass()); - pm.addPass(mlir::TFTPU::CreateTPUShardingIdentificationPass()); + pm.addPass( + tensorflow::tf2xla::internal::CreateTPUShardingIdentificationPass()); pm.addNestedPass( mlir::TFTPU::CreateTPUResourceReadsWritesPartitioningPass()); pm.addPass(mlir::TFDevice::CreateAnnotateParameterReplicationPass()); @@ -163,12 +162,9 @@ void AddReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm, pm.addNestedPass( tensorflow::tf2xla::internal::CreateVerifyClusteringPass()); } -// LINT.ThenChange(:non_replicated_bridge_passes) void NoCanonicalization(OpPassManager& pm) {} -// LINT.IfChange(non_replicated_bridge_passes) - // Same as above but for non-replicated Bridge. void AddNonReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm) { // The following ops must be preserved regardless of reachability. 
Ideally, @@ -218,7 +214,6 @@ void AddNonReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm) { pm.addNestedPass( tensorflow::tf2xla::internal::CreateVerifyClusteringPass()); } -// LINT.ThenChange(:replicated_bridge_passes) }; // namespace internal }; // namespace tf2xla diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc index 7bf4c74e094af5..4adb8ebd160d57 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc @@ -66,7 +66,7 @@ LogicalResult HasAttr( // This is not expected to happen in practice if (!status.ok()) { LOG(ERROR) << "Failed to parse " << func_name << ": " - << tsl::NullTerminatedMessage(status); + << absl::StatusMessageAsCStr(status); return failure(); } if (predicate(*func_body->graph)) { diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD index 9641e092815b58..4ef78ef8d9b18d 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD @@ -30,6 +30,7 @@ cc_library( ":hoist_broadcast_read", ":mark_ops_for_outside_compilation", ":tpu_cluster_formation", + ":tpu_sharding_identification_pass", ":verify_clustering_pass", ":xla_broadcast", ":xla_cluster_formation", @@ -322,6 +323,47 @@ cc_library( ], ) +cc_library( + name = "tpu_sharding_identification_pass", + srcs = ["tpu_sharding_identification_pass.cc"], + textual_hdrs = [ + "clustering_passes.h.inc", + ], + deps = [ + ":clustering_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/mlir/tensorflow:string_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_traits", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", + "//tensorflow/compiler/mlir/tensorflow:xla_sharding_util", + "//tensorflow/compiler/mlir/tensorflow/transforms:lower_tf_lib", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_pass_inc_gen", + "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalization_op_config", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Rewrite", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/client:sharding_builder", + ], +) + cc_library( name = "hoist_broadcast_read", srcs = ["hoist_broadcast_read.cc"], diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h index fb6e32ac377b79..85703c2306ad6b 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h +++ 
b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h
@@ -62,11 +62,17 @@ CreateHoistBroadcastReadPass();
 std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
 CreateXlaBroadcastPass();

+// Creates a pass that identifies XLASharding ops in launch op for TPU
+// computation.
+std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
+CreateTPUShardingIdentificationPass();
+
 #define GEN_PASS_REGISTRATION
 #define GEN_PASS_DECL_MARKOPSFOROUTSIDECOMPILATIONPASS
 #define GEN_PASS_DECL_TPUCLUSTERFORMATIONPASS
 #define GEN_PASS_DECL_TPUEXTRACTHEADTAILOUTSIDECOMPILATIONPASS
 #define GEN_PASS_DECL_TPUEXTRACTOUTSIDECOMPILATIONPASS
+#define GEN_PASS_DECL_TPUSHARDINGIDENTIFICATIONPASS
 #define GEN_PASS_DECL_VERIFYCLUSTERINGPASS
 #define GEN_PASS_DECL_XLACLUSTERFORMATIONPASS
 #include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc"
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td
index 2f617f7c154935..c1c34561ff0eb7 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td
+++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td
@@ -390,3 +390,55 @@ def XlaBroadcastPass : Pass<"tf-xla-broadcast", "mlir::func::FuncOp"> {
   let constructor = "tensorflow::tf2xla::internal::CreateXlaBroadcastPass()";
   let dependentDialects = ["mlir::tf_device::TensorFlowDeviceDialect"];
 }
+
+def TPUShardingIdentificationPass : Pass<"tf-tpu-sharding-identification", "ModuleOp"> {
+  let summary = "Identifies and handles inputs/outputs of TPU computation that is "
+                "sharded across logical cores.";
+  let constructor = "tensorflow::tf2xla::internal::CreateTPUShardingIdentificationPass()";
+  let description = [{
+    Bubbles up sharding configuration from `cluster_func` regions into
+    the attributes of `cluster_func`. This is done by parsing the
+    `XlaSharding` / `TPUPartitionedOutput` / `TPUPartitionedInput` ops inside
+    `cluster_func`.
+ + For example, given the following `cluster_func` wrapping `func`: + + ```mlir + func @test(%arg0: tensor<*xi32>) { + "tf_device.cluster_func"(%arg0) { + func = @func, + step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32> + return + } + + func @func(%arg0: tensor<*xi32>) -> tensor<*xi32> { + %0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "\01\02\03", + sharding = "\01\02\03"} : (tensor<*xi32>) -> tensor<*xi32> + %1 = "tf.A"(%0) : (tensor<*xi32>) -> (tensor<*xi32>) + return %1 : tensor<*xi32> + } + ``` + + Now, cluster_func receives the following `*_sharding_configuration` + attributes, and `func` receives the mhlo.sharding attribute: + + ```mlir + func @test(%arg0: tensor<*xi32>) { + %0 = "tf_device.cluster_func"(%arg0) { + func = @func, + input_sharding_configuration = ["\01\02\03"], + output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], + step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32> + return + } + func @func(%arg0: tensor<*xi32> {mhlo.sharding = "\01\02\03"}) -> + (tensor<*xi32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) { + %0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "\01\02\03", sharding = "\01\02\03"} : (tensor<*xi32>) -> tensor<*xi32> + %1 = "tf.A"(%0) : (tensor<*xi32>) -> tensor<*xi32> + return %1 : tensor<*xi32> + } + ``` + }]; +} + + diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_head_tail_outside_compilation.cc index ad85310291c146..e0dc7bda1f9c86 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_head_tail_outside_compilation.cc @@ -32,6 +32,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" @@ -78,7 +79,7 @@ bool HasOutsideCompilationAttribute(Operation* op) { // Finds op that created a given value. If the value is a BlockArgument, this // returns the owner of the Block. 
Operation* GetOpOfValue(Value value) { - if (auto block_arg = value.dyn_cast()) + if (auto block_arg = mlir::dyn_cast(value)) return block_arg.getOwner()->getParentOp(); return value.getDefiningOp(); diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_outside_compilation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_outside_compilation.cc index 6bc3468a2729e3..66340e57012e69 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/extract_outside_compilation.cc @@ -97,7 +97,6 @@ constexpr char kDeviceAttr[] = "device"; constexpr char kHostFunctionAttr[] = "host_func"; constexpr char kXlaMapOutsideCompilationAttr[] = "_xla_map_outside_compilation"; constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; -constexpr char kNoReplicationCluster[] = "__no_replication_cluster"; #define GEN_PASS_DEF_EXTRACTOUTSIDECOMPILATIONPASS #include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" @@ -202,7 +201,6 @@ Operation* ApplyXlaHostTransferAttr(Operation* op, OpBuilder& builder) { Operation* CreateSendFromHostOp(OpBuilder& builder, Location loc, ValueRange inputs, Value compilation_key, Value device_ordinal, - int default_device_ordinal, StringAttr device_type_attr, llvm::StringRef communication_key) { if (device_ordinal) @@ -218,8 +216,7 @@ Operation* CreateSendFromHostOp(OpBuilder& builder, Location loc, loc, inputs, /*dynamic_key=*/compilation_key, builder.getStringAttr(communication_key), - /*device_ordinal=*/builder.getI64IntegerAttr(default_device_ordinal), - device_type_attr), + /*device_ordinal=*/builder.getI64IntegerAttr(0), device_type_attr), builder); } @@ -227,8 +224,7 @@ Operation* CreateSendFromHostOp(OpBuilder& builder, Location loc, // present, a tf._XlaRecvAtHostV2 op is created instead. Operation* CreateRecvAtHostOp(OpBuilder& builder, Location loc, TypeRange output_types, Value compilation_key, - Value device_ordinal, int default_device_ordinal, - StringAttr device_type_attr, + Value device_ordinal, StringAttr device_type_attr, llvm::StringRef communication_key) { if (device_ordinal) return ApplyXlaHostTransferAttr( @@ -241,8 +237,7 @@ Operation* CreateRecvAtHostOp(OpBuilder& builder, Location loc, builder.create( loc, output_types, /*dynamic_key=*/compilation_key, builder.getStringAttr(communication_key), - /*device_ordinal=*/builder.getI64IntegerAttr(default_device_ordinal), - device_type_attr), + /*device_ordinal=*/builder.getI64IntegerAttr(0), device_type_attr), builder); } @@ -386,7 +381,7 @@ llvm::SmallSetVector GetStaticExternalOperands( } continue; } - auto block_arg = v.cast(); + auto block_arg = mlir::cast(v); if (block_arg.getParentRegion() == op->getParentRegion()) external_values.insert(v); } @@ -475,7 +470,7 @@ void GetExternalOutputs(const llvm::SmallSetVector& cluster_ops, LogicalResult GetShardShapedType(Operation* context_op, int num_cores_per_replica, Type full_type, Type& shard_type) { - RankedTensorType ranked_type = full_type.dyn_cast(); + RankedTensorType ranked_type = mlir::dyn_cast(full_type); if (!ranked_type) return context_op->emitOpError() << "A map_outside_compilation op's input and output types must be " @@ -587,7 +582,8 @@ LogicalResult CreateHostComputeMap( // Convert MANUAL sharded outputs to split sharded outputs. 
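A pattern repeated across the hunks above and below: MLIR's member-style casts (`value.dyn_cast<T>()`, `type.isa<T>()`) are rewritten as the free functions `mlir::dyn_cast` / `mlir::isa` / `mlir::cast` pulled in via `mlir/Support/LLVM.h`, which upstream MLIR now prefers. A minimal sketch of the new style, mirroring `GetOpOfValue` above (the name `OwnerOf` is illustrative):

```c++
#include "mlir/IR/Block.h"
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"

// Free-function cast style: works uniformly on Value, Type, and Attribute.
mlir::Operation* OwnerOf(mlir::Value v) {
  // Block arguments have no defining op; return the op owning their block.
  if (auto arg = mlir::dyn_cast<mlir::BlockArgument>(v))
    return arg.getOwner()->getParentOp();
  return v.getDefiningOp();
}
```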
for (auto [full_type, out] : llvm::zip(full_output_types, host_compute.getResults())) { - RankedTensorType full_type_ranked = full_type.dyn_cast(); + RankedTensorType full_type_ranked = + mlir::dyn_cast(full_type); if (!full_type_ranked) return original_op->emitOpError() << "map_outside_compilation must have ranked outputs"; @@ -775,9 +771,9 @@ Operation* CreateHostOps(ArrayRef clustered_ops, ArrayRef external_operands, ArrayRef external_outputs, Operation* host_insertion_point, Value compilation_key, - Value device_ordinal, int default_device_ordinal, - StringAttr device_type_attr, OpBuilder& builder, - Operation& op, std::string args_communication_key, + Value device_ordinal, StringAttr device_type_attr, + OpBuilder& builder, Operation& op, + std::string args_communication_key, std::string retvals_communication_key, SmallVector& host_ops) { builder.setInsertionPoint(host_insertion_point); @@ -787,7 +783,7 @@ Operation* CreateHostOps(ArrayRef clustered_ops, Operation* recv_at_host = CreateRecvAtHostOp( builder, op.getLoc(), host_operand_types, compilation_key, device_ordinal, - default_device_ordinal, device_type_attr, args_communication_key); + device_type_attr, args_communication_key); if (!external_operands.empty()) host_ops.push_back(recv_at_host); Operation* after_op = recv_at_host; @@ -801,7 +797,7 @@ Operation* CreateHostOps(ArrayRef clustered_ops, if (!external_outputs.empty()) { Operation* send_from_host = CreateSendFromHostOp( builder, op.getLoc(), external_outputs, compilation_key, device_ordinal, - default_device_ordinal, device_type_attr, retvals_communication_key); + device_type_attr, retvals_communication_key); host_ops.push_back(send_from_host); } @@ -855,8 +851,8 @@ LogicalResult MoveToHostSingleCluster( llvm::SmallVector& core_to_mapping, ArrayRef core_to_host_insertion_point, ArrayRef core_to_compilation_key, - ArrayRef core_to_device_ordinal, int default_device_ordinal, - StringAttr device_type_attr, bool is_map_oc, int num_cores_per_replica, + ArrayRef core_to_device_ordinal, StringAttr device_type_attr, + bool is_map_oc, int num_cores_per_replica, std::string& common_split_sharding, int& communication_key_index) { OpBuilder builder(core_to_host_insertion_point[0]); Operation& op = *clustered_ops.back(); @@ -891,8 +887,8 @@ LogicalResult MoveToHostSingleCluster( clustered_ops, external_operands, external_outputs, core_to_host_insertion_point[0], core_to_compilation_key[0], core_to_device_ordinal.empty() ? nullptr : core_to_device_ordinal[0], - default_device_ordinal, device_type_attr, builder, op, - args_communication_key, retvals_communication_key, host0_ops); + device_type_attr, builder, op, std::move(args_communication_key), + std::move(retvals_communication_key), host0_ops); if (external_operands.empty()) { recv_at_host->erase(); @@ -960,9 +956,8 @@ LogicalResult MoveToHostMultiCluster( mlir::tf_device::ClusterOp device_cluster, Block* src, ArrayRef core_to_host_insertion_point, ArrayRef core_to_compilation_key, - ArrayRef core_to_device_ordinal, int default_device_ordinal, - bool control_above, std::optional& is_map_oc, - int& communication_key_index, + ArrayRef core_to_device_ordinal, bool control_above, + std::optional& is_map_oc, int& communication_key_index, llvm::SmallVector* return_value_from_host = nullptr) { const int num_cores_per_replica = core_to_host_insertion_point.size(); // common_split_sharding is set upon the first use of map_outside_compilation. 
@@ -1005,8 +1000,8 @@ LogicalResult MoveToHostMultiCluster( clustered_ops.getArrayRef(), external_operands.getArrayRef(), external_outputs.getArrayRef(), core_to_mapping, core_to_host_insertion_point, core_to_compilation_key, - core_to_device_ordinal, default_device_ordinal, device_type_attr, - *is_map_oc, num_cores_per_replica, common_split_sharding, + core_to_device_ordinal, device_type_attr, *is_map_oc, + num_cores_per_replica, common_split_sharding, communication_key_index))) return mlir::failure(); clustered_ops.clear(); @@ -1033,8 +1028,8 @@ LogicalResult MoveToHostMultiCluster( clustered_ops.getArrayRef(), external_operands.getArrayRef(), external_outputs.getArrayRef(), core_to_mapping, core_to_host_insertion_point, core_to_compilation_key, - core_to_device_ordinal, default_device_ordinal, device_type_attr, - *is_map_oc, num_cores_per_replica, common_split_sharding, + core_to_device_ordinal, device_type_attr, *is_map_oc, + num_cores_per_replica, common_split_sharding, communication_key_index))) return mlir::failure(); clustered_ops.clear(); @@ -1068,7 +1063,6 @@ void GetReturnValueFromDevice( LogicalResult DecomposeControlFlow(mlir::tf_device::ClusterOp device_cluster, ArrayRef core_to_compilation_key, ArrayRef core_to_device_ordinal, - int default_device_ordinal, int& communication_key_index, std::optional& is_map_oc) { auto result = device_cluster.GetBody().walk([&](Operation* op) { @@ -1080,15 +1074,13 @@ LogicalResult DecomposeControlFlow(mlir::tf_device::ClusterOp device_cluster, device_cluster, &if_op.getThenBranch().front(), {host_if.getThenBranch().front().getTerminator()}, core_to_compilation_key, core_to_device_ordinal, - default_device_ordinal, /*control_above=*/true, is_map_oc, - communication_key_index))) + /*control_above=*/true, is_map_oc, communication_key_index))) return WalkResult::interrupt(); if (failed(MoveToHostMultiCluster( device_cluster, &if_op.getElseBranch().front(), {host_if.getElseBranch().front().getTerminator()}, core_to_compilation_key, core_to_device_ordinal, - default_device_ordinal, /*control_above=*/true, is_map_oc, - communication_key_index))) + /*control_above=*/true, is_map_oc, communication_key_index))) return WalkResult::interrupt(); // Mark op as stateful due to side-effecting communication ops. 
if_op->setAttr("is_stateless", builder.getBoolAttr(false)); @@ -1118,7 +1110,7 @@ LogicalResult DecomposeControlFlow(mlir::tf_device::ClusterOp device_cluster, builder.setInsertionPointToEnd(&cond.front()); auto recv_condition_at_host = CreateRecvAtHostOp( builder, while_op.getLoc(), TypeRange{condition.getType()}, - core_to_compilation_key[0], device_ordinal0, default_device_ordinal, + core_to_compilation_key[0], device_ordinal0, device_cluster->getAttrOfType( mlir::TF::kCompileDeviceTypeAttr), condition_send_recv_key); @@ -1128,15 +1120,14 @@ LogicalResult DecomposeControlFlow(mlir::tf_device::ClusterOp device_cluster, if (failed(MoveToHostMultiCluster( device_cluster, &while_op.getCond().front(), {recv_condition_at_host}, core_to_compilation_key, - core_to_device_ordinal, default_device_ordinal, + core_to_device_ordinal, /*control_above=*/true, is_map_oc, communication_key_index))) return WalkResult::interrupt(); if (failed(MoveToHostMultiCluster( device_cluster, &while_op.getBody().front(), {host_while.getBody().front().getTerminator()}, core_to_compilation_key, core_to_device_ordinal, - default_device_ordinal, /*control_above=*/true, is_map_oc, - communication_key_index))) + /*control_above=*/true, is_map_oc, communication_key_index))) return WalkResult::interrupt(); // Mark op as stateful due to side-effecting communication ops. while_op->setAttr("is_stateless", builder.getBoolAttr(false)); @@ -1160,45 +1151,6 @@ void RemoveOutsideCompilation(mlir::tf_device::LaunchOp host_launch_op) { }); } -// This method extracts default ordinal or default device core associated with a -// host. -// If the cluster has replication attribute and it is not empty, then it means -// it is replicated case and then NO ordinal info is extracted but -// if it is non replicated cluster and there is a device attr with some -// non-empty device, then that device's ordinal (0 out of TPU:0 and -// 1 out of TPU:1) is extracted and the default ordinal is set to this value. -LogicalResult GetDefaultDeviceOrdinal(mlir::tf_device::ClusterOp device_cluster, - int& default_ordinal) { - bool has_replication = - device_cluster->hasAttr(mlir::TF::kReplicationInfoAttr); - - std::string replication_info; - if (has_replication) { - replication_info = - device_cluster - ->getAttrOfType(mlir::TF::kReplicationInfoAttr) - .str(); - } - if (replication_info == kNoReplicationCluster || replication_info.empty()) { - has_replication = false; - } - if (!has_replication && - device_cluster->hasAttrOfType(kDeviceAttr) && - !device_cluster->getAttrOfType(kDeviceAttr).str().empty()) { - int64_t ordinal = 0; - mlir::LogicalResult result = tensorflow::GetDeviceOrdinalFromDeviceString( - mlir::UnknownLoc::get(device_cluster.getContext()), - device_cluster->getAttrOfType(kDeviceAttr).str(), &ordinal); - if (succeeded(result)) { - default_ordinal = ordinal; - } else { - return device_cluster.emitError() - << " could not find ordinal for the given device"; - } - } - return mlir::success(); -} - // The results of parallel executes is the combination of return values from // both host and device. llvm::SmallVector GetParallelExecuteResultsTypes( @@ -1485,19 +1437,15 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( } builder.setInsertionPoint(tmp_parallel_execute_op); - int default_device_ordinal = 0; - if (failed(GetDefaultDeviceOrdinal(device_cluster, default_device_ordinal))) { - return mlir::failure(); - } // communication_key_index is part of the message identifier and is // incremented for each _XlaHostComputeMlir. 
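For context on the ordinal plumbing deleted above: the removed `GetDefaultDeviceOrdinal` pulled the trailing index out of a TPU device string (0 from `TPU:0`, 1 from `TPU:1`), and the send/recv builders now simply hardcode ordinal 0. A rough sketch of that extraction using the `DeviceNameUtils` API that appears elsewhere in this patch; `GetTpuOrdinal` is a hypothetical name and error reporting is elided:

```c++
#include <cstdint>
#include <string>

#include "tensorflow/core/util/device_name_utils.h"

// Sketch: "/job:worker/replica:0/task:0/device:TPU:1" -> ordinal 1.
bool GetTpuOrdinal(const std::string& device_name, int64_t& ordinal) {
  tensorflow::DeviceNameUtils::ParsedName parsed;
  if (!tensorflow::DeviceNameUtils::ParseFullOrLocalName(device_name, &parsed))
    return false;
  if (parsed.type != "TPU" || !parsed.has_id) return false;
  ordinal = parsed.id;
  return true;
}
```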
int communication_key_index = 0; // Decompose control flow into device and host control flow when outside // compilation is included. - if (failed(DecomposeControlFlow( - device_cluster, core_to_compilation_key, core_to_device_ordinal, - default_device_ordinal, communication_key_index, is_map_oc))) + if (failed(DecomposeControlFlow(device_cluster, core_to_compilation_key, + core_to_device_ordinal, + communication_key_index, is_map_oc))) return mlir::failure(); // Move all outside compiled ops including control flow to tmp host launch. @@ -1505,7 +1453,7 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( if (failed(MoveToHostMultiCluster( device_cluster, &device_cluster.GetBody(), core_to_host_insertion_point, core_to_compilation_key, - core_to_device_ordinal, default_device_ordinal, + core_to_device_ordinal, /*control_above=*/false, is_map_oc, communication_key_index, &returns_from_host))) return mlir::failure(); diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/hoist_broadcast_read.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/hoist_broadcast_read.cc index 732bae8c67b018..f16df445439084 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/hoist_broadcast_read.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/hoist_broadcast_read.cc @@ -72,7 +72,7 @@ Operation* GetAncestorBelow(Operation* descendant, Operation* ancestor) { // `is_cpu_read` is set to `true` iff `read` is on a resource with device type // CPU. LogicalResult IsCpuRead(FuncOp func, ReadVariableOp read, bool& is_cpu_read) { - if (auto arg = read->getOperand(0).dyn_cast()) { + if (auto arg = mlir::dyn_cast(read->getOperand(0))) { if (arg.getOwner() != &(func.front())) { is_cpu_read = false; return success(); diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc index bc0e25f505e11b..d6c92101bf608a 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/input_lowering_metrics_pass.cc @@ -30,9 +30,7 @@ namespace internal { namespace { -using llvm::DenseSet; using mlir::Operation; -using mlir::TypeID; using mlir::WalkResult; #define GEN_PASS_DEF_INPUTLOWERINGMETRICSPASS diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc index dde1fd4514d719..7308669b6359cb 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/mark_ops_for_outside_compilation.cc @@ -38,6 +38,7 @@ limitations under the License. 
#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Rewrite/PatternApplicator.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" @@ -161,6 +162,8 @@ void AddSupportedFunctionalOps(MLIRContext* context, OperationName(mlir::TF::WhileRegionOp::getOperationName(), context)); supported_ops->insert( OperationName(mlir::TF::XlaCallModuleOp::getOperationName(), context)); + supported_ops->insert( + OperationName(mlir::TF::XlaHostComputeOp::getOperationName(), context)); supported_ops->insert( OperationName(mlir::TF::XlaReduceOp::getOperationName(), context)); supported_ops->insert( @@ -236,13 +239,13 @@ void AddRewrittenCompositeOps(MLIRContext* context, } bool IsStringType(Type type) { - if (type.isa()) return true; + if (mlir::isa(type)) return true; - auto sub_type = type.dyn_cast(); + auto sub_type = mlir::dyn_cast(type); if (!sub_type) return false; bool has_string = llvm::any_of(sub_type.GetSubtypes(), [](TensorType type) { - return type.getElementType().isa(); + return mlir::isa(type.getElementType()); }); return has_string; } @@ -288,7 +291,8 @@ bool IsSupportedOp(Operation& op, } bool IsVariant(Value value) { - return getElementTypeOrSelf(value.getType()).isa(); + return mlir::isa( + getElementTypeOrSelf(value.getType())); } bool HasOutsideCompiledAncestor(Operation* op) { diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc index b600c865661d58..e76e2e3bc86b14 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_cluster_formation.cc @@ -51,6 +51,7 @@ limitations under the License. #include "mlir/IR/ValueRange.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" @@ -90,7 +91,6 @@ constexpr llvm::StringRef kNumCoresPerReplicaAttr = "num_cores_per_replica"; constexpr llvm::StringRef kNumReplicasAttr = "num_replicas"; constexpr llvm::StringRef kMirroredVariableIndicesAttr = "_mirrored_variable_indices"; -constexpr llvm::StringRef kNoReplicationCluster = "__no_replication_cluster"; constexpr llvm::StringRef kBadReplicateInfoAttrMsg = "requires '_replication_info' string attribute"; @@ -142,7 +142,7 @@ LogicalResult CollectMetadata(Block* block, MetadataMap* metadata_map) { return metadata_op.emitError() << kBadReplicateInfoAttrMsg; auto replication_info_attr_str = - replication_info_attr.dyn_cast(); + mlir::dyn_cast(replication_info_attr); if (!replication_info_attr_str || replication_info_attr_str.getValue().empty()) return metadata_op.emitError() << kBadReplicateInfoAttrMsg; @@ -171,39 +171,50 @@ struct OpDevice { std::string device; }; -// Collects and clusters ops either based on `_replication_info` attribute -// (replicated case) or using one single cluster (non-replicated case). 
Also -// sets `device_type` if there is any cluster (note that the device type must be -// unique, otherwise we emit an error). -// Returns an error in case of invalid compilation or replication attribute(s). -LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, - std::string& device_type, - std::string& device) { - bool has_replicated_compiled_op = false; - bool has_non_replicated_compiled_op = false; - bool has_local_device_name_collisions = false; +LogicalResult HasValidDeviceTypeAttribute(Block* block) { // Use ordered set here to make error message below deterministic. std::set device_types; - absl::flat_hash_map devices; + for (Operation& op : *block) { + // Collect device types which currently must be consistent per block + // (checked later). + if (auto device_type_attr = + op.getAttrOfType(mlir::TF::kCompileDeviceTypeAttr)) { + // tf.StatefulPartitionedCall ops with and without + // _tpu_replicate attributes may exist in the same graph. Ops without + // the attribute but with _XlaMustCompile=true would have + // _xla_compile_device_type="" after + // CanonicalizeCompileAndReplicateAttributesPass. Skip empty value here. + if (!device_type_attr.getValue().empty()) { + device_types.insert(device_type_attr); + } + } + } + + if (device_types.size() > 1) { + return block->getParentOp()->emitError() + << "found different '" << mlir::TF::kCompileDeviceTypeAttr + << "' attribute values (" << llvm::join(device_types, ",") + << ") in same block which is not supported"; + } + return success(); +} + +// Collects and clusters ops based on `_replication_info` attribute. Returns +// an error in case of invalid compilation or replication attribute(s). +LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters) { + LogicalResult result = HasValidDeviceTypeAttribute(block); + if (failed(result)) return result; + for (Operation& op : *block) { LogicalResult result = mlir::TF::HasValidCompilationAndReplicationAttributes(op); if (failed(result)) return result; - // Collect device types which currently must be consistent per block - // (checked later). + // Skip ops with non-TPU device type, they are handled elsewhere. auto device_type_attr = op.getAttrOfType(mlir::TF::kCompileDeviceTypeAttr); if (device_type_attr) { - // Some graphs in TPU bridge may have both tf.StatefulPartitionedCall - // ops with and without _tpu_replicate attributes. As a result, the ops - // without such attribute would have _xla_compile_device_type="" after - // CanonicalizeCompileAndReplicateAttributesPass, if they also had - // _XlaMustCompile = true before the pass. We should filter out such - // unspecified device type here. if (device_type_attr.getValue().empty()) continue; - device_types.insert(device_type_attr); - // Stop here for ops with non-TPU devices, they are handled elsewhere. if (device_type_attr.getValue() != mlir::TF::kTpuDevice) continue; } @@ -213,105 +224,10 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, // `HasValidCompilationAndReplicationAttributes` above, assert here for // documentation and to avoid breakage when that function is changed. assert(op.hasAttr(mlir::TF::kCompileDeviceTypeAttr)); - has_replicated_compiled_op = true; auto attr = op.getAttrOfType(mlir::TF::kReplicationInfoAttr); auto it = clusters->try_emplace(attr.getValue()); it.first->getSecond().insert(&op); - } else if (op.hasAttr(mlir::TF::kCompileDeviceTypeAttr)) { - // For non-replicated case, assume one cluster per block (in line with - // Framework behavior). 
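The refactor above moves the device-type consistency rule into its own helper. The invariant can be sketched on its own, assuming `mlir::TF::kCompileDeviceTypeAttr` denotes the `_xla_compile_device_type` string used in this file; `CheckSingleDeviceType` is a hypothetical reduction of `HasValidDeviceTypeAttribute` without its diagnostics:

```c++
#include <set>

#include "llvm/ADT/StringRef.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

// All non-empty compile-device-type attributes in a block must agree.
mlir::LogicalResult CheckSingleDeviceType(mlir::Block* block) {
  std::set<llvm::StringRef> device_types;
  for (mlir::Operation& op : *block)
    if (auto attr = op.getAttrOfType<mlir::StringAttr>(
            "_xla_compile_device_type"))
      if (!attr.getValue().empty()) device_types.insert(attr.getValue());
  return mlir::success(device_types.size() <= 1);
}
```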
- has_non_replicated_compiled_op = true; - auto it = clusters->try_emplace(kNoReplicationCluster); - it.first->getSecond().insert(&op); } - auto device_attr = op.getAttrOfType(kDeviceAttr); - std::string device_local_name; - bool is_tpu_device = false; - if (device_attr && !device_attr.str().empty()) { - tensorflow::DeviceNameUtils::ParsedName parsed; - if (!tensorflow::DeviceNameUtils::ParseFullOrLocalName(device_attr.str(), - &parsed)) { - op.emitWarning() << "Invalid device name " << device_attr.str(); - return mlir::failure(); - } - - device_local_name = - tensorflow::DeviceNameUtils::LocalName(parsed.type, parsed.id); - is_tpu_device = parsed.type == "TPU"; - } - - // Ignore non-TPU devices when clustering. - if (!is_tpu_device) { - continue; - } - - if (!has_replicated_compiled_op && !device_local_name.empty()) { - // It is possible that a device may be same Local Name but - // different fullname. Devices with same Local name are identical - // so they should only be added once in 'devices'. - // and we need the fullname which is longer since longer name has more - // information such as task, replica, job etc. An example fullname is - // "/job:foo_bar/replica:1/task:2/device:GPU:3" - if (devices.count(device_local_name)) { - std::string device1 = devices[device_local_name].device; - std::string device2 = device_attr.str(); - // Is either of the two devices just a substring of the other? If - // not, we treat them as different devices, and we have a collision. - if (device1.find(device2) == std::string::npos && - device2.find(device1) == std::string::npos) { - Operation* previous_op = devices[device_local_name].op; - has_local_device_name_collisions = true; - - LOG_FIRST_N(WARNING, 1) - << "Found two devices with same local name " << device_local_name - << " but conflicting fullname: " << device1 << " and " << device2 - << "."; - LOG_FIRST_N(WARNING, 1) - << "Previous assignment came from op: " - << tensorflow::OpAsString(*previous_op) - << ". Current op is: " << tensorflow::OpAsString(op); - } - // Always keep the longer name. - if (devices[device_local_name].device.size() < - device_attr.str().size()) { - devices[device_local_name] = {&op, device_attr.str()}; - } - } else { - devices.insert({device_local_name, {&op, device_attr.str()}}); - } - } - } - // Do some checks for unsupported cases. - if (has_replicated_compiled_op && has_non_replicated_compiled_op) { - return block->getParentOp()->emitError() - << "found mixed replicated and non-replicated compiled ops in same " - "block which is not supported"; - } - if (device_types.size() > 1) { - return block->getParentOp()->emitError() - << "found different '" << mlir::TF::kCompileDeviceTypeAttr - << "' attribute values (" << llvm::join(device_types, ",") - << ") in same block which is not supported"; - } - if (!has_replicated_compiled_op) { - if (devices.size() > 1) { - LOG(WARNING) << "found different devices for no replication: "; - for (const auto& device_names : devices) { - LOG(WARNING) << device_names.first << ", " - << device_names.second.device; - } - } else if (has_local_device_name_collisions) { - LOG(WARNING) << "Not assigning device because of conflicting fullnames."; - } else if (devices.size() == 1 && - absl::StrContains(devices.begin()->second.device, "TPU:")) { - device = devices.begin()->second.device; - } - } - if (!clusters->empty()) { - // Note that for size < 1 we shouldn't have any cluster while for size > 1 - // we should have returned with an error above. 
- assert(device_types.size() == 1); - device_type = device_types.begin()->str(); } return success(); } @@ -637,7 +553,7 @@ Operation* BuildPartitionedOutputs( builder.create(result_op->getLoc(), results); // Then erase all the identity and partitioned output ops. - for (auto [_, ops] : partitioned_outputs) { + for (const auto& [_, ops] : partitioned_outputs) { for (mlir::TF::TPUPartitionedOutputV2Op op : ops) { op->erase(); } @@ -885,39 +801,14 @@ LogicalResult ReplicateCluster(mlir::tf_device::ClusterOp cluster, return success(); } -void SetNoReplicationClusterAttrs(mlir::tf_device::ClusterOp cluster, - llvm::StringRef device_type, - llvm::StringRef device) { - OpBuilder builder(cluster); - cluster->setAttr(mlir::TF::kReplicationInfoAttr, - builder.getStringAttr(kNoReplicationCluster)); - cluster->setAttr(mlir::TF::kCompileDeviceTypeAttr, - builder.getStringAttr(device_type)); - - if (!device.empty()) { - cluster->setAttr(kDeviceAttr, builder.getStringAttr(device)); - } - // TODO(b/229992058) Propagate `allow_soft_placement` (and other attributes?) - // instead of hard-coding. - cluster->setAttr("allow_soft_placement", builder.getBoolAttr(true)); - cluster->setAttr("topology", builder.getStringAttr("")); - cluster->setAttr("num_cores_per_replica", - builder.getIntegerAttr(builder.getI32Type(), 1)); - cluster->setAttr("device_assignment", builder.getArrayAttr({})); - cluster->setAttr("use_spmd_for_xla_partitioning", builder.getBoolAttr(false)); - cluster->setAttr("step_marker_location", builder.getStringAttr("")); -} - -// Forms compilation clusters in `block`. If the block contains a -// `TPUReplicateMetadata` op, then we form clusters according to -// `_replication_info` values (ops with same value go to same cluster). -// Otherwise, in the non-replicated case, we build one compilation cluster per +// Forms clusters with ops of the same `_replication_info` attribute under a // block. // -// We do this in following steps: -// 1. Find `TPUReplicateMetadata` op in `block` (might not exist). -// 2. Collect and group cluster ops (either based on `_replication_info` -// attributes or forming one single cluster). +// For a given block, clusters are formed via grouping ops by +// `_replication_info` attributes. For every cluster formed: +// 1. Find associated TPUReplicateMetadata attributes with the same +// `_replication_info` attribute. +// 2. Find users not in cluster that are interleaved between cluster ops. // 3. Find external uses of cluster ops. // 4. Create `tf_device.cluster` with results consisting of the external uses // of cluster ops determined at 3. @@ -948,24 +839,21 @@ LogicalResult FormClustersInBlock( return mlir::failure(); } } + return success(); } ClusterMap clusters; - std::string device_type; - std::string device; - result = CollectAndGroupClusterOps(block, &clusters, device_type, device); + result = CollectAndGroupClusterOps(block, &clusters); if (failed(result)) return result; for (const auto& cluster_metadata_and_ops : clusters) { const auto& cluster_ops = cluster_metadata_and_ops.getSecond(); - bool has_replication = - cluster_metadata_and_ops.getFirst() != kNoReplicationCluster; auto cluster_metadata = metadata_map.find(cluster_metadata_and_ops.getFirst()); - // No TPUReplicateMetadata for a `_replication_info` attribute. - if (has_replication && cluster_metadata == metadata_map.end()) { + // No TPUReplicateMetadata for a `_replication_info` attribute. 
+ if (cluster_metadata == metadata_map.end()) { block->getParentOp()->emitWarning() << "TPUReplicateMetadata for associated '" << mlir::TF::kReplicationInfoAttr << "' attribute '" @@ -984,27 +873,19 @@ LogicalResult FormClustersInBlock( mlir::tf_device::ClusterOp cluster = CreateClusterOp( block, cluster_ops, results, cluster_successor_ops.getArrayRef()); - if (!has_replication) { - SetNoReplicationClusterAttrs(cluster, device_type, device); - continue; - } - // Determine `num_replicas`. - auto num_replicas_attr = - cluster_metadata->getSecond().get(kNumReplicasAttr); - if (!num_replicas_attr || !num_replicas_attr.isa()) + auto num_replicas = cluster_metadata->getSecond().get(kNumReplicasAttr); + if (!num_replicas || !num_replicas.isa()) return cluster.emitError() << "requires '" << kNumReplicasAttr << "' int attribute"; - int num_replicas = num_replicas_attr.cast().getInt(); - // Determine `num_cores_per_replica`. int num_cores_per_replica = 1; - auto num_cores_per_replica_attr = - cluster_metadata->getSecond() - .get(kNumCoresPerReplicaAttr) - .dyn_cast_or_null(); + auto num_cores_per_replica_attr = mlir::dyn_cast_or_null( + cluster_metadata->getSecond().get(kNumCoresPerReplicaAttr)); if (num_cores_per_replica_attr) num_cores_per_replica = num_cores_per_replica_attr.getInt(); - if (failed(ReplicateCluster(cluster, num_replicas, num_cores_per_replica))) + if (failed(ReplicateCluster(cluster, + num_replicas.cast().getInt(), + num_cores_per_replica))) return mlir::failure(); // Copy TPUReplicateMetadata attributes to `tf_device.cluster`. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_sharding_identification_pass.cc similarity index 88% rename from tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc rename to tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_sharding_identification_pass.cc index fb2588f50631e8..ba35b03e8d6be7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/tpu_sharding_identification_pass.cc @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -53,14 +53,30 @@ limitations under the License. 
#include "xla/xla_data.pb.h" #include "tsl/platform/errors.h" -namespace mlir { -namespace TFTPU { +namespace tensorflow { +namespace tf2xla { +namespace internal { namespace { using OpShardingVariant = std::variant; using OpShardingVector = llvm::SmallVector; using OptionalOpShardingVector = llvm::SmallVector, 8>; +using llvm::StringRef; +using mlir::Block; +using mlir::BlockArgument; +using mlir::BoolAttr; +using mlir::Builder; +using mlir::IntegerAttr; +using mlir::LogicalResult; +using mlir::ModuleOp; +using mlir::Operation; +using mlir::OpOperand; +using mlir::OpResult; +using mlir::RankedTensorType; +using mlir::StringAttr; +using mlir::Value; +using mlir::WalkResult; constexpr char kReplicateSharding[] = ""; constexpr char kShardingAttr[] = "mhlo.sharding"; @@ -69,7 +85,7 @@ constexpr char kAliasingAttr[] = "tf.aliasing_output"; constexpr char kNumCoresPerReplicaAttr[] = "num_cores_per_replica"; #define GEN_PASS_DEF_TPUSHARDINGIDENTIFICATIONPASS -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" struct TPUShardingIdentificationPass : public impl::TPUShardingIdentificationPassBase< @@ -93,11 +109,11 @@ mlir::Operation* NullUnlessSharded(PartitionedOp op) { // a `tf_device.cluster_func`. mlir::Operation* GetXlaShardingFromOperand(Value value) { Value value_to_visit = value; - if (auto read_var = value_to_visit.getDefiningOp()) + if (auto read_var = value_to_visit.getDefiningOp()) value_to_visit = read_var.getResource(); if (auto partitioned_input = - value_to_visit.getDefiningOp()) { + value_to_visit.getDefiningOp()) { return NullUnlessSharded(partitioned_input); } @@ -107,10 +123,10 @@ mlir::Operation* GetXlaShardingFromOperand(Value value) { // Returns the op sharding attribute from a partitioned operator. std::optional GetXlaShardingFromOperator(mlir::Operation* op) { if (auto partitioned_output = - llvm::dyn_cast(op)) { + llvm::dyn_cast(op)) { return partitioned_output.get_XlaSharding(); } else if (auto partitioned_input = - llvm::dyn_cast(op)) { + llvm::dyn_cast(op)) { return partitioned_input.get_XlaSharding(); } else { return std::nullopt; @@ -174,9 +190,9 @@ LogicalResult VerifySharding(mlir::Type type, // Some test cases use \01\02\03 as sharding, to test propagation. Treat // a non-proto sharding as valid, and don't verify further. We also only // verify shardings that actually break a tensor apart. - return success(); + return mlir::success(); } - if (RankedTensorType ranked_type = type.dyn_cast()) { + if (RankedTensorType ranked_type = mlir::dyn_cast(type)) { const int64_t tensor_rank = ranked_type.getRank(); int tile_assignment_rank = sharding->tile_assignment_dimensions_size(); @@ -194,10 +210,10 @@ LogicalResult VerifySharding(mlir::Type type, << " extra dimension(s) by: " << sharding->DebugString(); } - return failure(); + return mlir::failure(); } } - return success(); + return mlir::success(); } // Verify sharding for all arguments and return values. 
@@ -209,7 +225,7 @@ LogicalResult VerifyShardings(mlir::func::FuncOp func, llvm::zip(sharding_for_args, function_block.getArguments())) { const auto& sharding = std::get<0>(sharding_and_arg); BlockArgument arg = std::get<1>(sharding_and_arg); - if (failed(VerifySharding(arg.getType(), sharding))) return failure(); + if (failed(VerifySharding(arg.getType(), sharding))) return mlir::failure(); } Operation* terminator = function_block.getTerminator(); for (auto sharding_and_retval : @@ -217,9 +233,9 @@ LogicalResult VerifyShardings(mlir::func::FuncOp func, const auto& sharding = std::get<0>(sharding_and_retval); OpOperand& retval = std::get<1>(sharding_and_retval); if (failed(VerifySharding(retval.get().getType(), sharding))) - return failure(); + return mlir::failure(); } - return success(); + return mlir::success(); } // Assign the logical device if an op has an attribute `TPU_REPLICATED_CORE:n`, @@ -262,7 +278,7 @@ std::optional GetXlaShardingFromArg( for (auto& use : value_to_visit.getUses()) { Operation* owner = use.getOwner(); - if (auto sharding = llvm::dyn_cast(owner)) + if (auto sharding = llvm::dyn_cast(owner)) return sharding.get_XlaSharding(); if (auto logical_device = AssignLogicalDeviceFromTPUReplicatedCoreAttr( @@ -270,7 +286,7 @@ std::optional GetXlaShardingFromArg( return logical_device; } - if (auto while_op = llvm::dyn_cast(owner)) { + if (auto while_op = llvm::dyn_cast(owner)) { const int operand_number = use.getOperandNumber(); next_values_to_visit.push_back( while_op.getCond().front().getArgument(operand_number)); @@ -279,14 +295,15 @@ std::optional GetXlaShardingFromArg( continue; } - if (llvm::isa(owner)) { + if (llvm::isa(owner)) { next_values_to_visit.push_back(use.getOwner()->getResult(0)); continue; } - if (auto call_op = llvm::dyn_cast(owner)) { - func::FuncOp func = - llvm::dyn_cast(call_op.resolveCallable()); + if (auto call_op = llvm::dyn_cast(owner)) { + mlir::func::FuncOp func = + llvm::dyn_cast(call_op.resolveCallable()); if (!func) continue; next_values_to_visit.push_back( func.getArgument(use.getOperandNumber())); @@ -307,8 +324,8 @@ std::optional GetXlaShardingFromArg( // XlaSharding op. void IdentifyXlaShardingForComputationInputs( const llvm::SmallVector& logical_device_vec, - bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, - func::FuncOp func, Builder* builder, + bool infer_from_computation, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::func::FuncOp func, Builder* builder, OptionalOpShardingVector& sharding_for_args) { // Look up function definition from module. 
Block& function_block = func.front(); @@ -360,20 +377,20 @@ mlir::Operation* GetXlaShardingFromResult(Value value) { Operation* user = *value.getUsers().begin(); if (auto partitioned_output = - llvm::dyn_cast(user)) + llvm::dyn_cast(user)) return NullUnlessSharded(partitioned_output); - if (auto assign_var = llvm::dyn_cast(user)) + if (auto assign_var = llvm::dyn_cast(user)) if (auto partitioned_input = assign_var.getResource() - .getDefiningOp()) + .getDefiningOp()) return NullUnlessSharded(partitioned_input); return nullptr; } absl::Status DetermineShardingFromAlias( - func::FuncOp func, OptionalOpShardingVector& input_shardings, + mlir::func::FuncOp func, OptionalOpShardingVector& input_shardings, OptionalOpShardingVector& output_shardings) { for (int arg_idx = 0; arg_idx < func.getNumArguments(); ++arg_idx) { if (auto v = @@ -427,7 +444,7 @@ std::optional GetXlaShardingFromRetval( continue; } - if (auto sharding = llvm::dyn_cast_or_null(def)) + if (auto sharding = llvm::dyn_cast_or_null(def)) return sharding.get_XlaSharding(); if (auto sharding = def->getAttrOfType("_XlaSharding")) { @@ -456,20 +473,20 @@ std::optional GetXlaShardingFromRetval( continue; } - if (auto call_op = llvm::dyn_cast_or_null(def)) { - func::FuncOp func = - llvm::dyn_cast(call_op.resolveCallable()); + if (auto call_op = llvm::dyn_cast_or_null(def)) { + mlir::func::FuncOp func = + llvm::dyn_cast(call_op.resolveCallable()); if (!func) continue; value_to_visit = func.front().getTerminator()->getOperand( - value_to_visit.cast().getResultNumber()); + mlir::cast(value_to_visit).getResultNumber()); values_to_visit.push_back(value_to_visit); continue; } - if (auto while_op = llvm::dyn_cast(def)) { - if (auto op_result = value_to_visit.cast()) { + if (auto while_op = llvm::dyn_cast(def)) { + if (auto op_result = mlir::cast(value_to_visit)) { int result_idx = op_result.getResultNumber(); - if (auto yield_op = llvm::dyn_cast( + if (auto yield_op = llvm::dyn_cast( while_op.getBody().front().getTerminator())) { values_to_visit.push_back(yield_op.getOperand(result_idx)); } @@ -485,8 +502,8 @@ std::optional GetXlaShardingFromRetval( // XlaSharding/ TPUPartitionedOutput op connected to the retvals/results. void IdentifyXlaShardingForComputationOutputs( const llvm::SmallVector& logical_device_vec, - bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, - func::FuncOp func, Builder* builder, + bool infer_from_computation, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::func::FuncOp func, Builder* builder, OptionalOpShardingVector& sharding_for_rets) { Block& function_block = func.front(); Operation* terminator = function_block.getTerminator(); @@ -566,8 +583,8 @@ absl::Status MoveSharding(OptionalOpShardingVector& optional_shardings, // depending on `use_spmd`. absl::Status IdentifyXlaShardingForInputsAndOutputs( const llvm::SmallVector& logical_device_vec, bool use_spmd, - bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, - func::FuncOp func, Builder* builder, OpShardingVector& input_sharding, + bool infer_from_computation, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::func::FuncOp func, Builder* builder, OpShardingVector& input_sharding, OpShardingVector& output_sharding) { OptionalOpShardingVector optional_input_sharding; OptionalOpShardingVector optional_output_sharding; @@ -592,11 +609,11 @@ absl::Status IdentifyXlaShardingForInputsAndOutputs( // Extracts input/output sharding configuration of `cluster_func` by parsing // XlaSharding ops inside the `cluster_func`. 
LogicalResult IdentifyXlaShardingForTPUComputation( - Builder* builder, tf_device::ClusterFuncOp cluster_func) { + Builder* builder, mlir::tf_device::ClusterFuncOp cluster_func) { // Look up function definition from module. - func::FuncOp func = - cluster_func->getParentOfType().lookupSymbol( - cluster_func.getFunc()); + mlir::func::FuncOp func = + cluster_func->getParentOfType() + .lookupSymbol(cluster_func.getFunc()); bool use_spmd = false; if (auto use_spmd_attr = cluster_func->getAttrOfType(kUseSpmdAttr)) @@ -624,7 +641,7 @@ LogicalResult IdentifyXlaShardingForTPUComputation( sharding_for_args, sharding_for_rets); !status.ok()) { LOG(ERROR) << status; - return failure(); + return mlir::failure(); }; auto has_maximal_sharding = @@ -654,7 +671,7 @@ LogicalResult IdentifyXlaShardingForTPUComputation( sharding_for_args, sharding_for_rets); !status.ok()) { LOG(ERROR) << status; - return failure(); + return mlir::failure(); } } @@ -686,26 +703,30 @@ LogicalResult IdentifyXlaShardingForTPUComputation( GetStrArrayAttr(builder, sharding_for_args)); cluster_func->setAttr(tensorflow::kOutputShardingAttr, GetStrArrayAttr(builder, sharding_for_rets)); - return success(); + return mlir::success(); } void TPUShardingIdentificationPass::runOnOperation() { Builder builder(getOperation().getContext()); - auto result = getOperation().walk([&](tf_device::ClusterFuncOp cluster_func) { - if (failed(IdentifyXlaShardingForTPUComputation(&builder, cluster_func))) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); + auto result = + getOperation().walk([&](mlir::tf_device::ClusterFuncOp cluster_func) { + if (failed( + IdentifyXlaShardingForTPUComputation(&builder, cluster_func))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); if (result.wasInterrupted()) return signalPassFailure(); } } // namespace -std::unique_ptr> CreateTPUShardingIdentificationPass() { +std::unique_ptr> +CreateTPUShardingIdentificationPass() { return std::make_unique(); } -} // namespace TFTPU -} // namespace mlir +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir deleted file mode 100644 index 8b0dc1b54bf9e5..00000000000000 --- a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: tf-opt -split-input-file -hlo-xla-runtime-pipeline %s | FileCheck %s - -// CHECK-LABEL: func.func @simple_add( -func.func @simple_add(%arg0: tensor) -> tensor { - // CHECK: arith.addf - %0 = mhlo.add %arg0, %arg0 : tensor - return %0 : tensor -} diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir index 328a00ce59bbec..b015011dae7b2d 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir @@ -407,9 +407,9 @@ func.func @uniform_quantized_clip_by_value(%input: tensor<3x2xf32>) -> tensor<3x %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> // tensor_proto that points to dense<127> of type !tf_type.qint32. 
- // CHECK-DAG: %[[MIN_MAX:.*]] = mhlo.constant() <{value = dense<127> : tensor<2xi32>}> : () -> tensor<2x!quant.uniform> - %min = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> - %max = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + // CHECK-DAG: %[[MIN_MAX:.*]] = mhlo.constant() <{value = dense<127> : tensor<3x2xi32>}> : () -> tensor<3x2x!quant.uniform> + %min = "tf.Const"() { value = #tf_type : tensor<3x2x!tf_type.qint32> } : () -> tensor<3x2x!tf_type.qint32> + %max = "tf.Const"() { value = #tf_type : tensor<3x2x!tf_type.qint32> } : () -> tensor<3x2x!tf_type.qint32> // CHECK-DAG: %[[OPERAND:.*]] = mhlo.uniform_quantize %arg0 : (tensor<3x2xf32>) -> tensor<3x2x!quant.uniform> %0 = "tf.UniformQuantize"(%input, %scales, %zps) { @@ -419,10 +419,10 @@ func.func @uniform_quantized_clip_by_value(%input: tensor<3x2xf32>) -> tensor<3x // CHECK-DAG: %[[CONVERT_1:.*]] = mhlo.bitcast_convert %[[OPERAND]] : (tensor<3x2x!quant.uniform>) -> tensor<3x2xi32> // CHECK-DAG: %[[CONVERT_2:.*]] = mhlo.bitcast_convert %[[CONVERT_1]] : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> // CHECK: %[[MIN_CLIPPED:.*]] = chlo.broadcast_maximum %[[CONVERT_2]], %[[MIN_MAX]] {broadcast_dimensions = array} : - // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<3x2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> // CHECK: %[[MAX_CLIPPED:.*]] = chlo.broadcast_minimum %[[MIN_CLIPPED]], %[[MIN_MAX]] {broadcast_dimensions = array} : - // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) + // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<3x2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> // CHECK: %[[RESULT:.*]] = mhlo.bitcast_convert %[[MAX_CLIPPED]] : (tensor<3x2x!quant.uniform>) -> tensor<3x2xi32> // CHECK: return %[[RESULT]] : tensor<3x2xi32> @@ -430,48 +430,48 @@ func.func @uniform_quantized_clip_by_value(%input: tensor<3x2xf32>) -> tensor<3x quantization_axis = 1 : i64, quantization_min_val = -2147483648 : i64, quantization_max_val = 2147483647 : i64 - } : (tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + } : (tensor<3x2x!tf_type.qint32>, tensor<3x2x!tf_type.qint32>, tensor<3x2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> func.return %1 : tensor<3x2x!tf_type.qint32> } // ----- // CHECK-LABEL: func @uniform_quantized_clip_by_value_min_not_const -func.func @uniform_quantized_clip_by_value_min_not_const(%input: tensor<3x2x!tf_type.qint32>, %min: tensor<2x!tf_type.qint32>) -> tensor<3x2x!tf_type.qint32> { +func.func @uniform_quantized_clip_by_value_min_not_const(%input: tensor<3x2x!tf_type.qint32>, %min: tensor<3x2x!tf_type.qint32>) -> tensor<3x2x!tf_type.qint32> { %scales = "tf.Const"() { value = dense<2.0> : tensor<2xf32> } : () -> tensor<2xf32> %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> // tensor_proto that points to dense<127> of type !tf_type.qint32. 
- %max = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %max = "tf.Const"() { value = #tf_type : tensor<3x2x!tf_type.qint32> } : () -> tensor<3x2x!tf_type.qint32> // CHECK-DAG: %[[INPUT:.*]] = mhlo.bitcast_convert %arg0 : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> - // CHECK-DAG: %[[MIN:.*]] = mhlo.bitcast_convert %arg1 : (tensor<2xi32>) -> tensor<2x!quant.uniform> + // CHECK-DAG: %[[MIN:.*]] = mhlo.bitcast_convert %arg1 : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> // CHECK: chlo.broadcast_maximum %[[INPUT]], %[[MIN]] %res = "tf.UniformQuantizedClipByValue"(%input, %min, %max, %scales, %zps) { quantization_axis = 1 : i64, quantization_min_val = -2147483648 : i64, quantization_max_val = 2147483647 : i64 - } : (tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + } : (tensor<3x2x!tf_type.qint32>, tensor<3x2x!tf_type.qint32>, tensor<3x2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> func.return %res : tensor<3x2x!tf_type.qint32> } // ----- // CHECK-LABEL: func @uniform_quantized_clip_by_value_max_not_const -func.func @uniform_quantized_clip_by_value_max_not_const(%input: tensor<3x2x!tf_type.qint32>, %max: tensor<2x!tf_type.qint32>) -> tensor<3x2x!tf_type.qint32> { +func.func @uniform_quantized_clip_by_value_max_not_const(%input: tensor<3x2x!tf_type.qint32>, %max: tensor<3x2x!tf_type.qint32>) -> tensor<3x2x!tf_type.qint32> { %scales = "tf.Const"() { value = dense<2.0> : tensor<2xf32> } : () -> tensor<2xf32> %zps = "tf.Const"() { value = dense<4> : tensor<2xi32> } : () -> tensor<2xi32> // tensor_proto that points to dense<127> of type !tf_type.qint32. - %min = "tf.Const"() { value = #tf_type : tensor<2x!tf_type.qint32> } : () -> tensor<2x!tf_type.qint32> + %min = "tf.Const"() { value = #tf_type : tensor<3x2x!tf_type.qint32> } : () -> tensor<3x2x!tf_type.qint32> // CHECK-DAG: %[[INPUT:.*]] = mhlo.bitcast_convert %arg0 : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> - // CHECK-DAG: %[[MAX:.*]] = mhlo.bitcast_convert %arg1 : (tensor<2xi32>) -> tensor<2x!quant.uniform> + // CHECK-DAG: %[[MAX:.*]] = mhlo.bitcast_convert %arg1 : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> // CHECK-DAG: %[[INPUT_1:.*]] = chlo.broadcast_maximum // CHECK: chlo.broadcast_minimum %[[INPUT_1]], %[[MAX]] %res = "tf.UniformQuantizedClipByValue"(%input, %min, %max, %scales, %zps) { quantization_axis = 1 : i64, quantization_min_val = -2147483648 : i64, quantization_max_val = 2147483647 : i64 - } : (tensor<3x2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> + } : (tensor<3x2x!tf_type.qint32>, tensor<3x2x!tf_type.qint32>, tensor<3x2x!tf_type.qint32>, tensor<2xf32>, tensor<2xi32>) -> tensor<3x2x!tf_type.qint32> func.return %res : tensor<3x2x!tf_type.qint32> } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir index bb9ca266fc7abc..91008b91056d40 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir @@ -2637,35 +2637,6 @@ func.func @bitcast_smaller_output_width(%arg0: tensor<2xf32>) -> tensor<2x2xf16> // ----- -// CHECK-LABEL: reshape -func.func @reshape(%arg0: tensor<2xf32>, %arg1: tensor<2xi32>) -> tensor<2x1xf32> { - // CHECK: mhlo.reshape - %0 = "tf.Reshape"(%arg0, %arg1) : (tensor<2xf32>, tensor<2xi32>) -> 
tensor<2x1xf32> - func.return %0 : tensor<2x1xf32> -} - -// ----- - -// CHECK-LABEL: not_lowering_reshape -func.func @not_lowering_reshape(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor<1x!tf_type.string> { - // CHECK: "tf.Reshape" - %0 = "tf.Reshape"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor<1x!tf_type.string> - func.return %0 : tensor<1x!tf_type.string> -} - -// ----- - -// CHECK-LABEL: reshape_dynamic -func.func @reshape_dynamic(%arg0: tensor, %arg1: tensor<2xi32>) -> tensor { - // CHECK: "chlo.dynamic_reshape" - // CHLO: mhlo.compute_reshape_shape - // CHLO: mhlo.dynamic_reshape - %0 = "tf.Reshape"(%arg0, %arg1) : (tensor, tensor<2xi32>) -> tensor - func.return %0 : tensor -} - -// ----- - // CHECK-LABEL: squeeze func.func @squeeze(%arg0: tensor<1x1x10xf32>) -> tensor<1x10xf32> { // CHECK: mhlo.reshape @@ -2680,7 +2651,7 @@ func.func @squeeze_ranked(%arg0: tensor) -> tensor { // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[D2:.*]] = tensor.dim %arg0, %[[C2]] : tensor // CHECK: %[[T:.*]] = tensor.from_elements %[[D2]] : tensor<1xindex> - // CHECK: %[[R:.*]] = "chlo.dynamic_reshape"(%arg0, %[[T]]) : (tensor, tensor<1xindex>) -> tensor + // CHECK: %[[R:.*]] = mhlo.dynamic_reshape %arg0, %[[T]] : (tensor, tensor<1xindex>) -> tensor // CHECK: return %[[R]] : tensor %0 = "tf.Squeeze"(%arg0) { squeeze_dims = [0, 1] }: (tensor) -> tensor func.return %0 : tensor @@ -2695,7 +2666,7 @@ func.func @squeeze_ranked_negative(%arg0: tensor) -> tensor // CHECK: %[[T:.*]] = tensor.from_elements %[[D0]], %[[D2]] : tensor<2xindex> - // CHECK: %[[R:.*]] = "chlo.dynamic_reshape"(%arg0, %[[T]]) : (tensor, tensor<2xindex>) -> tensor + // CHECK: %[[R:.*]] = mhlo.dynamic_reshape %arg0, %[[T]] : (tensor, tensor<2xindex>) -> tensor // CHECK: return %[[R]] : tensor %0 = "tf.Squeeze"(%arg0) { squeeze_dims = [-2] }: (tensor) -> tensor func.return %0 : tensor diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD index b76b52c9fd774a..34ffdfa90f028f 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD @@ -477,6 +477,7 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tpu_embedding_ops_registry", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", ], diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc index b5a99b35f7b547..d5e5c5d08e4ff3 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h" +#include "llvm/ADT/DenseSet.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h" @@ -135,232 +136,243 @@ bool IsOpTypeAllowedTf2XlaFallback(const TypeID& type_id) { // end, which would not be thread safe. 
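The comment ending above explains why this allowlist is built once rather than appended to later: concurrent mutation of the set would race. Reduced to its skeleton, the pattern is a function-local static holding a never-freed heap set, which C++11 guarantees is initialized exactly once (names here are illustrative):

```c++
#include "llvm/ADT/DenseSet.h"
#include "mlir/Support/TypeID.h"

// Skeleton of the allowlist pattern used below: initialized once, thread-safe
// under C++11 magic statics, and intentionally leaked.
const llvm::SmallDenseSet<mlir::TypeID, 512>& AllowedOps() {
  static auto* ops = new llvm::SmallDenseSet<mlir::TypeID, 512>{
      // mlir::TypeID::get<SomeOp>(), ...
  };
  return *ops;
}
```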
static auto* ops = [] { - llvm::SmallDenseSet* ops_set = - new llvm::SmallDenseSet{ - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - // CaseOp isn't actually supported but is enabled for testing to - // make sure ops with symbol ref attributes are filtered out. - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - // TODO(hinsu): Canonicalize QuantizeAndDequantize and - // QuantizeAndDequantizeV2 to QuantizeAndDequantizeV3 by converting - // attributes to operands. 
- TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get< - TF::XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp>(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - TypeID::get(), - }; + llvm::SmallDenseSet* ops_set = new llvm::SmallDenseSet< + mlir::TypeID, 512>{ + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + // CaseOp isn't actually supported but is enabled for testing to + // make sure ops with symbol ref attributes are filtered out. 
+ TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + // TODO(hinsu): Canonicalize QuantizeAndDequantize and + // QuantizeAndDequantizeV2 to QuantizeAndDequantizeV3 by converting + // attributes to operands. + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get< + TF::XlaSparseDenseMatmulGradWithAdagradMomentumAndCsrInputOp>(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get< + TF::XlaSparseDenseMatmulGradWithAdagradAndStaticBufferSizeOp>(), + TypeID::get< + 
TF::XlaSparseDenseMatmulGradWithAdagradMomentumAndStaticBufferSizeOp>(), // NOLINT + TypeID::get< + TF::XlaSparseDenseMatmulGradWithAdamAndStaticBufferSizeOp>(), + TypeID::get< + TF::XlaSparseDenseMatmulGradWithFtrlAndStaticBufferSizeOp>(), + TypeID::get< + TF::XlaSparseDenseMatmulGradWithSgdAndStaticBufferSizeOp>(), // NOLINT + TypeID::get(), + TypeID::get(), + TypeID::get(), + }; // Add the ops from the TPUEmbeddingOpsRegistry. for (auto op_type_id : diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc index bd8b2135882bb2..25b8196ebfe629 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc @@ -54,12 +54,12 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr class LegalizationOpConfigTest : public ::testing::Test { public: - tsl::Status CreateMlirModule(std::string module_string = kMlirModuleStr) { + absl::Status CreateMlirModule(std::string module_string = kMlirModuleStr) { TF_ASSIGN_OR_RETURN( module_, test::GetMlirModuleFromString(module_string, &context_)); context_.loadAllAvailableDialects(); - return tsl::OkStatus(); + return absl::OkStatus(); } absl::StatusOr GetMain() { @@ -135,8 +135,8 @@ TEST_F(LegalizationOpConfigTest, CountLoweringsSet) { // from MLIR to TF2XLA), these numbers should change. Or if TF Dialect adds // a new op, we should expect these to change too. EXPECT_EQ(mlir_lowering_count, 67); - EXPECT_EQ(tf2xla_fallback_count, 316); - EXPECT_EQ(non_categorized_count, 424); + EXPECT_EQ(tf2xla_fallback_count, 322); + EXPECT_EQ(non_categorized_count, 428); } // Just a counter test to see which ops have duplicate lowerings. This isn't a diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index edf0b96b569fea..fd0b33c20c7127 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -51,6 +51,7 @@ limitations under the License. #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "stablehlo/dialect/ChloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -89,10 +90,10 @@ static size_t GetFeatureDimension(tensorflow::TensorFormat format, // Gets all integer values from the given attribute and push them to `values`. void GetI64ArrayAttrValues(Attribute attr, SmallVectorImpl *values) { - auto array_attr = attr.cast(); + auto array_attr = mlir::cast(attr); values->reserve(array_attr.getValue().size()); for (Attribute val : array_attr.getValue()) - values->push_back(val.cast().getValue().getSExtValue()); + values->push_back(mlir::cast(val).getValue().getSExtValue()); } // Returns 1D 32-bit dense elements attribute with the given values. @@ -142,8 +143,8 @@ Type GetSumAccumulationType(Type input_type) { // format supports negative indexing unlike HLO. 
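Nearly every hunk in legalize_tf.cc from here on applies one mechanical change: member-style casts on mlir::Type and mlir::Attribute (value.getType().cast<T>(), attr.dyn_cast<T>()) become the free functions mlir::cast<T>(...), mlir::dyn_cast<T>(...), and mlir::dyn_cast_or_null<T>(...), made available through the newly included mlir/Support/LLVM.h. A compilable sketch of the before/after, with an illustrative variable name:

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"

// The free-function casts work uniformly for Type and Attribute and return
// a null value on failure, exactly like the old member-style dyn_cast.
mlir::RankedTensorType GetRankedType(mlir::Value v) {
  // Before: v.getType().dyn_cast<mlir::RankedTensorType>()
  return mlir::dyn_cast<mlir::RankedTensorType>(v.getType());
}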
static IntegerAttr GetHLOAxisFromTFAxis(Attribute attr, int64_t rank, Builder *b) { - IntegerAttr intAttr = attr.dyn_cast_or_null(); - if (auto elementAttr = attr.dyn_cast_or_null()) { + IntegerAttr intAttr = mlir::dyn_cast_or_null(attr); + if (auto elementAttr = mlir::dyn_cast_or_null(attr)) { SmallVector index(elementAttr.getShapedType().getRank(), 0); intAttr = elementAttr.getValues()[index]; } @@ -198,7 +199,7 @@ static ConvertOp CastValueToI64(Location loc, Value value, // must be a ranked tensor. static TF::UnpackOp UnpackTensorAlongZeroDim(Location loc, Value value, PatternRewriter *rewriter) { - auto indices_type = value.getType().cast(); + auto indices_type = mlir::cast(value.getType()); int num_outputs = indices_type.getShape().front(); SmallVector unpacked_indices_type( num_outputs, @@ -214,7 +215,7 @@ static TF::UnpackOp UnpackTensorAlongZeroDim(Location loc, Value value, // // Aborts if the type is ranked but doesn't have the dimension. int64_t GetDimSize(Type ty, int64_t index) { - RankedTensorType ranked_ty = ty.dyn_cast(); + RankedTensorType ranked_ty = mlir::dyn_cast(ty); if (!ranked_ty) return -1; return ranked_ty.getDimSize(index); @@ -298,8 +299,8 @@ template static Value StaticBinaryBroadcast(Location loc, Value x, Value y, DenseIntElementsAttr broadcast_dims, OpBuilder &builder) { - auto x_type = x.getType().cast(); - auto y_type = y.getType().cast(); + auto x_type = mlir::cast(x.getType()); + auto y_type = mlir::cast(y.getType()); auto result_type = GetStaticBroadcastType(x_type, y_type, broadcast_dims); if (!result_type) { emitError(loc) << "could not binary broadcast " << x_type << ", " << y_type @@ -353,7 +354,7 @@ static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to, Value broadcast_from, int64_t feature_dim, OpBuilder &builder) { auto broadcast_dims = GetI64ElementsAttr({feature_dim}, &builder); - auto to_type = broadcast_to.getType().cast(); + auto to_type = mlir::cast(broadcast_to.getType()); auto result_shape = builder.create(loc, broadcast_to); auto result_extents_type = GetExtentsTensorTypeFor(to_type); auto result_extents = builder.create( @@ -372,11 +373,11 @@ static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to, static Value BroadcastToShapeOf(Location loc, Value input, Value broadcast_to, OpBuilder &builder) { auto result_shape = builder.create(loc, broadcast_to); - auto to_type = broadcast_to.getType().cast(); + auto to_type = mlir::cast(broadcast_to.getType()); auto result_extents_type = GetExtentsTensorTypeFor(to_type); auto result_extents = builder.create( loc, result_extents_type, result_shape); - int64_t rank = input.getType().cast().getRank(); + int64_t rank = mlir::cast(input.getType()).getRank(); auto broadcast_dims = GetI64ElementsAttrForSeq(0, rank, &builder); return builder.create( loc, to_type, input, result_extents, broadcast_dims); @@ -520,8 +521,8 @@ static void CreateWhile32(Location loc, int num_iterations, static IntegerAttr getFeatureDimensionAttr(Builder &b, tensorflow::TensorFormat format, Value input) { - return b.getI64IntegerAttr( - GetFeatureDimension(format, input.getType().cast())); + return b.getI64IntegerAttr(GetFeatureDimension( + format, mlir::cast(input.getType()))); } //===----------------------------------------------------------------------===// @@ -567,7 +568,7 @@ static DenseIntElementsAttr Get2DTransposePerm(BoolAttr transpose, Builder *b) { // attribute. 
static DenseIntElementsAttr SliceDenseIntElementsAttrColumn2D( ElementsAttr input, int column) { - auto int_attr = input.cast(); + auto int_attr = mlir::cast(input); auto shaped_type = int_attr.getType(); auto shape = shaped_type.getShape(); @@ -605,8 +606,8 @@ static DenseIntElementsAttr GetInteriorPadding(ElementsAttr tf_padding) { // must be broadcasted with a size 1 tensor or another dynamic dimension. // Returns false on rankless. static bool AreBroadcastCompatible(Value x, Value y) { - auto x_rankless = x.getType().dyn_cast(); - auto y_rankless = y.getType().dyn_cast(); + auto x_rankless = mlir::dyn_cast(x.getType()); + auto y_rankless = mlir::dyn_cast(y.getType()); if (!x_rankless || !y_rankless) { return false; } @@ -634,7 +635,7 @@ static bool AreBroadcastCompatible(Value x, Value y) { // updated element type. static Type ChangeTensorElementType(Builder *b, Type tensor_type, Type element_type) { - RankedTensorType ranked_type = tensor_type.dyn_cast(); + RankedTensorType ranked_type = mlir::dyn_cast(tensor_type); if (ranked_type) { return tensorflow::GetTypeFromTFTensorShape(ranked_type.getShape(), element_type); @@ -659,7 +660,7 @@ static Type GetAccumulationType(Type ty) { //===----------------------------------------------------------------------===// static DenseElementsAttr GetEpsilonValue(Type ty) { - auto element_ty = ty.cast().getElementType(); + auto element_ty = mlir::cast(ty).getElementType(); auto scalar_ty = tensorflow::GetTypeFromTFTensorShape({}, element_ty); if (element_ty.isF16()) { uint16_t raw_epsilon = Eigen::numext::bit_cast( @@ -750,9 +751,10 @@ static bool ArgTypesMatchCallee(mlir::Operation *op, OperandRange args, static bool CanBeTranslatedToDynamicSlice(Value input, Value start_indices, DenseIntElementsAttr slice_sizes) { - auto input_ty = input.getType().dyn_cast(); + auto input_ty = mlir::dyn_cast(input.getType()); if (!input_ty) return false; - auto start_indices_ty = start_indices.getType().dyn_cast(); + auto start_indices_ty = + mlir::dyn_cast(start_indices.getType()); if (!start_indices_ty) return false; int64_t input_rank = input_ty.getRank(); @@ -780,11 +782,11 @@ static DenseIntElementsAttr TFSliceSizes2HLOSliceSizes( Builder *builder) { DenseIntElementsAttr constant_start_indices; if (!matchPattern(start_indices, m_Constant(&constant_start_indices))) { - return hlo::convertElementsAttr(slice_sizes, builder->getIntegerType(64)) - .cast(); + return mlir::cast( + hlo::convertElementsAttr(slice_sizes, builder->getIntegerType(64))); } - auto input_ty = input.getType().dyn_cast(); + auto input_ty = mlir::dyn_cast(input.getType()); int64_t input_rank = input_ty.getRank(); ArrayRef input_shape = input_ty.getShape(); SmallVector normalized_sizes; @@ -906,7 +908,7 @@ class ConvertBiasAddOp : public OpRewritePattern { if (!FormatFromString(op.getDataFormat().str(), &data_format)) return op.emitOpError("invalid data format"); - auto value_type = op.getValue().getType().dyn_cast(); + auto value_type = mlir::dyn_cast(op.getValue().getType()); if (!value_type) return failure(); auto feature_dim = GetFeatureDimension(data_format, value_type); auto bias_broadcast = Broadcast1DToFeatureDim( @@ -1008,11 +1010,9 @@ class ConvertConvDynamic : public OpRewritePattern { if (!GetPaddingFromString(op.getPadding().str(), &padding).ok()) return failure(); - auto input_ty = - op.getInput().getType().template dyn_cast(); - auto filter_ty = - op.getFilter().getType().template dyn_cast(); - auto result_ty = op.getType().template dyn_cast(); + auto input_ty = 
mlir::dyn_cast(op.getInput().getType()); + auto filter_ty = mlir::dyn_cast(op.getFilter().getType()); + auto result_ty = mlir::dyn_cast(op.getType()); if (!input_ty || !filter_ty || !result_ty) return failure(); // TODO(disc): Remove this constraint once fold and canonicalization // implemented. @@ -1035,7 +1035,7 @@ class ConvertConvDynamic : public OpRewritePattern { SmallVector paddings; auto get_int = [](Attribute attr) { - return attr.template cast().getInt(); + return mlir::cast(attr).getInt(); }; constexpr int num_dims = num_spatial_dims + 2; @@ -1177,10 +1177,8 @@ class ConvertConvOp : public OpRewritePattern { if (!GetPaddingFromString(op.getPadding().str(), &padding).ok()) return failure(); - auto input_ty = - op.getInput().getType().template dyn_cast(); - auto filter_ty = - op.getFilter().getType().template dyn_cast(); + auto input_ty = mlir::dyn_cast(op.getInput().getType()); + auto filter_ty = mlir::dyn_cast(op.getFilter().getType()); // With the exception of input's batch dimension, input and filter need to // have static shape for calculation of HLO paddings and feature group count @@ -1205,7 +1203,7 @@ class ConvertConvOp : public OpRewritePattern { SmallVector paddings; auto get_int = [](Attribute attr) { - return attr.template cast().getInt(); + return mlir::cast(attr).getInt(); }; constexpr int num_dims = num_spatial_dims + 2; @@ -1228,7 +1226,7 @@ class ConvertConvOp : public OpRewritePattern { int64_t pad_high_int64; int64_t input_size = input_ty.getDimSize(dim); if (input_size == ShapedType::kDynamic) return failure(); - tsl::Status status = tensorflow::GetWindowedOutputSizeVerbose( + absl::Status status = tensorflow::GetWindowedOutputSizeVerbose( input_size, filter_ty.getDimSize(i), dilation, stride, padding, &output_size, &pad_low_int64, &pad_high_int64); if (!status.ok()) return failure(); @@ -1318,8 +1316,8 @@ class ConvertPadOpDynamic : public OpRewritePattern { auto input = op.getInput(); auto paddings = op.getPaddings(); auto constant_values = op.getConstantValues(); - auto input_type = input.getType().dyn_cast(); - auto paddings_type = paddings.getType().dyn_cast(); + auto input_type = mlir::dyn_cast(input.getType()); + auto paddings_type = mlir::dyn_cast(paddings.getType()); if (!input_type || !paddings_type || !paddings_type.hasStaticShape()) return failure(); @@ -1385,9 +1383,9 @@ class ConvertGatherNdOpDynamic : public OpRewritePattern { PatternRewriter &rewriter) const override { Location loc = op.getLoc(); auto params = op.getParams(); - auto params_ty = params.getType().dyn_cast(); + auto params_ty = mlir::dyn_cast(params.getType()); auto indices = op.getIndices(); - auto indices_ty = indices.getType().dyn_cast(); + auto indices_ty = mlir::dyn_cast(indices.getType()); auto params_rank = params_ty.getRank(); auto indices_rank = indices_ty.getRank(); int64_t num_index_dims = indices_ty.getDimSize(indices_rank - 1); @@ -1485,8 +1483,8 @@ class ConvertBF16FloorDivOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::FloorDivOp op, PatternRewriter &rewriter) const override { - auto l = op.getX().dyn_cast>(); - auto r = op.getY().dyn_cast>(); + auto l = mlir::dyn_cast>(op.getX()); + auto r = mlir::dyn_cast>(op.getY()); if (!l || !r) return failure(); auto element_type = getElementTypeOrSelf(l.getType()); @@ -1515,14 +1513,14 @@ class ConvertBroadcastToOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::BroadcastToOp op, PatternRewriter &rewriter) const override { - auto input_type = op.getInput().getType().dyn_cast(); + auto input_type 
= mlir::dyn_cast(op.getInput().getType()); auto output_type = op.getOutput().getType(); if (!input_type) { return rewriter.notifyMatchFailure(op, "requires ranked input shape"); } llvm::SmallVector broadcast_dimensions; if (input_type.getRank() > 0) { - auto ranked_output_type = output_type.dyn_cast(); + auto ranked_output_type = mlir::dyn_cast(output_type); if (!ranked_output_type) { return rewriter.notifyMatchFailure(op, "requires ranked output shape"); } @@ -1546,7 +1544,7 @@ class ConvertRollOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(TF::RollOp op, PatternRewriter &rewriter) const override { - auto shift_ty = op.getShift().getType().dyn_cast(); + auto shift_ty = mlir::dyn_cast(op.getShift().getType()); if (!shift_ty || shift_ty.getRank() != 0) { return rewriter.notifyMatchFailure( op, "require the type of shift to be 0D tensor"); @@ -1558,7 +1556,7 @@ class ConvertRollOp : public OpRewritePattern { } int axis = val.getSExtValue(); - auto input_ty = op.getInput().getType().dyn_cast(); + auto input_ty = mlir::dyn_cast(op.getInput().getType()); if (!input_ty || !input_ty.hasStaticShape()) { return rewriter.notifyMatchFailure( op, "require the type of input to have static shapes"); @@ -1674,7 +1672,7 @@ class ConvertDiagPartOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::DiagPartOp op, PatternRewriter &rewriter) const override { - auto input_type = op.getInput().getType().dyn_cast(); + auto input_type = mlir::dyn_cast(op.getInput().getType()); if (!input_type || !input_type.hasStaticShape()) return failure(); int64_t num_dims = input_type.getRank(); if (num_dims < 2 || num_dims % 2 != 0) return failure(); @@ -1771,7 +1769,7 @@ class ConvertMatrixDiagPartV3Op LogicalResult matchAndRewrite(TF::MatrixDiagPartV3Op op, PatternRewriter &rewriter) const override { Location loc = op.getLoc(); - ShapedType input_type = op.getInput().getType().dyn_cast(); + ShapedType input_type = mlir::dyn_cast(op.getInput().getType()); // Align is a string specifying how superdiagonals and subdiagonals should // be aligned/padded for diagonals that are shorter than max_diag_len. The @@ -2035,7 +2033,7 @@ class ConvertFFTOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - auto input_ty = op.getInput().getType().template cast(); + auto input_ty = mlir::cast(op.getInput().getType()); if (!input_ty.hasRank()) { return failure(); } @@ -2131,14 +2129,12 @@ class ConvertFusedBatchNormGradBase // TODO(b/141785544): Update this to not require static shapes. // activation shape needs to be static to convert negative indices in // TensorFlow to absolute indices required by HLO. - RankedTensorType act_type = - act.getType().template dyn_cast(); + RankedTensorType act_type = mlir::dyn_cast(act.getType()); if (!act_type) return failure(); Type act_ele_type = act_type.getElementType(); // To support mixed precision, the statistics type, which maybe more // precise than the input types, are used for this op. 
- Type kernel_type = - scale.getType().template cast().getElementType(); + Type kernel_type = mlir::cast(scale.getType()).getElementType(); grad = rewriter.create(loc, grad, kernel_type); act = rewriter.create(loc, act, kernel_type); @@ -2260,14 +2256,13 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { auto feature_dim = getFeatureDimensionAttr(rewriter, data_format, op.getX()); - auto input_type_tensor = op.getX().getType().template cast(); + auto input_type_tensor = mlir::cast(op.getX().getType()); auto input_element_type = input_type_tensor.getElementType(); - auto scale_type_tensor = - op.getScale().getType().template cast(); + auto scale_type_tensor = mlir::cast(op.getScale().getType()); auto scale_element_type = scale_type_tensor.getElementType(); - auto mean_type_tensor = op.getMean().getType().template cast(); + auto mean_type_tensor = mlir::cast(op.getMean().getType()); auto mean_element_type = mean_type_tensor.getElementType(); // In the training case, dimensions of input tensors must be static. if (op.getIsTraining() && (!input_type_tensor.hasStaticShape() || @@ -2281,7 +2276,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { Value bn_train_input = rewriter.create( op.getLoc(), op.getX(), scale_element_type); TensorType bn_train_input_type_tensor = - bn_train_input.getType().template cast(); + mlir::cast(bn_train_input.getType()); if (op.getIsTraining()) { // Training case. @@ -2372,7 +2367,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // For FusedBatchNormV3Op, also create a constant tensor to forward to // last reserve_space_3 output. auto reserve_space_3_type = - op.getResult(5).getType().template cast(); + mlir::cast(op.getResult(5).getType()); int num_elements = reserve_space_3_type.hasStaticShape() ? reserve_space_3_type.getNumElements() : 0; @@ -2416,7 +2411,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // For FusedBatchNormV3Op, also create a constant tensor to forward to // last reserve_space_3 output. auto reserve_space_3_type = - op.getResult(5).getType().template cast(); + mlir::cast(op.getResult(5).getType()); int num_elements = reserve_space_3_type.hasStaticShape() ? 
reserve_space_3_type.getNumElements() : 0; @@ -2465,9 +2460,9 @@ static PaddingArray GetReduceWindowPaddingAsArray( for (const auto &dim : input_dims) input_shape.push_back(dim); for (Attribute attr : window_dims) - window_shape.push_back(attr.cast().getInt()); + window_shape.push_back(mlir::cast(attr).getInt()); for (Attribute attr : window_strides) - strides.push_back(attr.cast().getInt()); + strides.push_back(mlir::cast(attr).getInt()); PaddingArray paddings = ::xla::MakePadding(input_shape, window_shape, strides, ::xla::Padding::kSame); @@ -2509,8 +2504,7 @@ Operation *AvgPoolDivideByCount( const SmallVector &strides, OpTy op, Value zero, PatternRewriter &rewriter) { Location loc = op.getLoc(); - RankedTensorType pooled_type = - pooled.getType().template cast(); + RankedTensorType pooled_type = mlir::cast(pooled.getType()); Type element_type = pooled_type.getElementType(); Operation *result = nullptr; RankedTensorType orig_input_type = @@ -2577,8 +2571,7 @@ class ConvertAvgPoolOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { Value input_value = GetAvgPoolInput(op); - auto input_type = - input_value.getType().template dyn_cast(); + auto input_type = mlir::dyn_cast(input_value.getType()); if (!input_type) return failure(); // We will do accumulation first; use a larger bitwidth if suitable. @@ -2587,7 +2580,7 @@ class ConvertAvgPoolOp : public OpRewritePattern { Type result_type; // The result type for reduction and division with the proper element type. - if (auto ranked_type = op.getType().template dyn_cast()) + if (auto ranked_type = mlir::dyn_cast(op.getType())) result_type = tensorflow::GetTypeFromTFTensorShape(ranked_type.getShape(), sum_element_type); else @@ -2695,8 +2688,7 @@ class ConvertAvgPoolGradOp : public OpRewritePattern { // `out_grad` is the gradient that was propagated via backpropagation from // the output layer. Value out_grad = op.getGrad(); - auto out_grad_type = - out_grad.getType().template dyn_cast(); + auto out_grad_type = mlir::dyn_cast(out_grad.getType()); if (!out_grad_type) { return failure(); } @@ -2833,7 +2825,7 @@ class ConvertMaxPoolOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { Type element_type = - op.getInput().getType().template cast().getElementType(); + mlir::cast(op.getInput().getType()).getElementType(); if (!element_type.isSignlessIntOrFloat()) return failure(); tensorflow::Padding padding; if (!GetPaddingFromString(op.getPadding().str(), &padding).ok()) @@ -2845,8 +2837,7 @@ class ConvertMaxPoolOp : public OpRewritePattern { ConstantOp init = GetScalarLimitConstOfType( element_type, loc, hlo::kInfinityLowest, &rewriter); - auto input_ty = - op.getInput().getType().template dyn_cast(); + auto input_ty = mlir::dyn_cast(op.getInput().getType()); if (!input_ty) return failure(); DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), @@ -2875,9 +2866,12 @@ class ConvertSelectOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::SelectOp op, PatternRewriter &rewriter) const override { // This lowering only works on ranked types. 
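The guard that follows is the idiom used by most patterns in this file: dyn_cast to RankedTensorType yields null for unranked values, and the pattern reports a readable reason via notifyMatchFailure instead of failing silently. A sketch under assumed names (the helper itself is hypothetical):

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"

// Bail out of a rewrite pattern when `v` is not a ranked tensor; the message
// is recorded by the pattern driver and helps diagnose why a lowering did
// not fire.
mlir::LogicalResult RequireRanked(mlir::Operation *op, mlir::Value v,
                                  mlir::PatternRewriter &rewriter) {
  if (!mlir::dyn_cast<mlir::RankedTensorType>(v.getType()))
    return rewriter.notifyMatchFailure(op, "requires ranked tensor operand");
  return mlir::success();
}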
- auto cond_type = op.getCondition().getType().dyn_cast(); - auto then_type = op.getThenValue().getType().dyn_cast(); - auto else_type = op.getElseValue().getType().dyn_cast(); + auto cond_type = + mlir::dyn_cast(op.getCondition().getType()); + auto then_type = + mlir::dyn_cast(op.getThenValue().getType()); + auto else_type = + mlir::dyn_cast(op.getElseValue().getType()); if (!cond_type || !then_type || !else_type) { return failure(); } @@ -2913,7 +2907,7 @@ class ConvertSelectOp : public OpRewritePattern { assumption = b.createOrFold( witness, ValueRange{assumption, eq_cstr}); } - auto result_type = op.getResult().getType().cast(); + auto result_type = mlir::cast(op.getResult().getType()); auto assuming_op = b.create(ArrayRef{result_type}, assumption); @@ -2978,7 +2972,7 @@ class ConvertSigmoidOp : public RewritePattern { // Create constant half with shape and element type same as the operand. Value operand = op.getOperand(); - auto operand_ty = operand.getType().cast(); + auto operand_ty = mlir::cast(operand.getType()); auto scalar_ty = tensorflow::GetTypeFromTFTensorShape({}, operand_ty.getElementType()); ElementsAttr attr = mlir::hlo::getSplat(&rewriter, scalar_ty, 0.5); @@ -3009,9 +3003,9 @@ class ConvertSliceOpDynamic : public OpRewritePattern { Value begin_indices = op.getBegin(); Value sizes = op.getSize(); - auto input_ty = input.getType().dyn_cast(); - auto begin_type = begin_indices.getType().dyn_cast(); - auto size_type = sizes.getType().dyn_cast(); + auto input_ty = mlir::dyn_cast(input.getType()); + auto begin_type = mlir::dyn_cast(begin_indices.getType()); + auto size_type = mlir::dyn_cast(sizes.getType()); if (!input_ty || !begin_type || !size_type || !begin_type.hasStaticShape() || !size_type.hasStaticShape() || @@ -3112,8 +3106,8 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, loc, TypeRange{shape_type, shape_type}, lhs_shape, const_neg2); auto rhs_splitted = rewriter->create( loc, TypeRange{shape_type, shape_type}, rhs_shape, const_neg2); - auto lhs_type = lhs.getType().cast(); - auto rhs_type = rhs.getType().cast(); + auto lhs_type = mlir::cast(lhs.getType()); + auto rhs_type = mlir::cast(rhs.getType()); // The last two dimensions are the matrix row/col dimensions. Don't broadcast // them. SmallVector result_batch_shape_compile_time_extents; @@ -3166,21 +3160,21 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { PatternRewriter &rewriter) const override { Value lhs = op.getX(); Value rhs = op.getY(); - auto lhs_type = lhs.getType().dyn_cast(); - auto rhs_type = rhs.getType().dyn_cast(); + auto lhs_type = mlir::dyn_cast(lhs.getType()); + auto rhs_type = mlir::dyn_cast(rhs.getType()); if (!lhs_type || !rhs_type) return failure(); - if (lhs_type.getElementType().isa() && op.getAdjX()) { + if (mlir::isa(lhs_type.getElementType()) && op.getAdjX()) { lhs = rewriter.create(op.getLoc(), lhs_type, lhs); } - if (rhs_type.getElementType().isa() && op.getAdjY()) { + if (mlir::isa(rhs_type.getElementType()) && op.getAdjY()) { rhs = rewriter.create(op.getLoc(), rhs_type, rhs); } // Broadcast both operands. 
BroadcastBatchMatMulV2Operands(lhs, rhs, op.getLoc(), &lhs, &rhs, &rewriter); - lhs_type = lhs.getType().cast(); - rhs_type = rhs.getType().cast(); + lhs_type = mlir::cast(lhs.getType()); + rhs_type = mlir::cast(rhs.getType()); assert(lhs_type.getRank() == rhs_type.getRank()); int64_t rank = lhs_type.getRank(); auto batch_dimensions = llvm::to_vector<4>(llvm::seq(0, rank - 2)); @@ -3243,7 +3237,7 @@ class ConvertSplitOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::SplitOp op, PatternRewriter &rewriter) const override { // We can only split inputs that have fully static shape. - auto input_type = op.getValue().getType().dyn_cast(); + auto input_type = mlir::dyn_cast(op.getValue().getType()); if (!input_type || !input_type.hasStaticShape()) return failure(); // We can only match when the split dimension is a constant scalar. @@ -3304,7 +3298,7 @@ class ConvertSplitOpDynamic : public OpRewritePattern { PatternRewriter &rewriter) const override { Location loc = op.getLoc(); Value input = op.getValue(); - auto input_type = input.getType().dyn_cast(); + auto input_type = mlir::dyn_cast(input.getType()); if (!input_type) return failure(); // TODO(disc): remove static shape check once folding/canonicalization func @@ -3419,7 +3413,7 @@ class ConvertSplitVOp : public OpRewritePattern { PatternRewriter &rewriter) const override { // We can only split inputs that have fully static shape. // TODO(b/145731001): enhance to support dynamic-shaped inputs. - auto input_type = op.getValue().getType().dyn_cast(); + auto input_type = mlir::dyn_cast(op.getValue().getType()); if (!input_type || !input_type.hasStaticShape()) return failure(); // We can only match when the split dimension is a constant scalar. @@ -3438,7 +3432,7 @@ class ConvertSplitVOp : public OpRewritePattern { int64_t total_dim_size = 0; // Total dimension size assigned to splits std::optional dynamic_dim_index; split_sizes.reserve( - split_sizes_attr.getType().cast().getNumElements()); + mlir::cast(split_sizes_attr.getType()).getNumElements()); for (const auto &dim : llvm::enumerate(split_sizes_attr)) { int64_t dim_val = dim.value().getSExtValue(); split_sizes.push_back(dim_val); @@ -3620,7 +3614,7 @@ class ConvertStridedSliceOp : public OpRewritePattern { // Begin must be a ranked, 1-dimensional tensor: This is checked by the // verifier. int64_t slicing_dim_size = - op.getBegin().getType().cast().getDimSize(0); + mlir::cast(op.getBegin().getType()).getDimSize(0); uint64_t begin_mask = op.getBeginMask(); uint64_t end_mask = op.getEndMask(); const int input_rank = input_shape.size(); @@ -3642,7 +3636,7 @@ class ConvertStridedSliceOp : public OpRewritePattern { // For the dimensions that are to be sliced, all have slice sizes of 1. SmallVector slice_sizes; auto begin_element_ty = - op.getBegin().getType().cast().getElementType(); + mlir::cast(op.getBegin().getType()).getElementType(); // Scalar tensor type. TensorType type = tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, begin_element_ty); @@ -3696,14 +3690,14 @@ class ConvertStridedSliceOp : public OpRewritePattern { // // TODO(hinsu): Relax this constraint for ops without negative indices and // strides. - auto input_ty = op.getInput().getType().dyn_cast(); + auto input_ty = mlir::dyn_cast(op.getInput().getType()); if (!input_ty || !input_ty.hasStaticShape()) return failure(); // Output shape needs to be static to apply 'new_axis_mask' or // 'shrink_axis_mask' by reshaping tensor after slice. // // TODO(hinsu): Relax this constraint for ops without the above masks. 
- auto result_ty = op.getType().dyn_cast(); + auto result_ty = mlir::dyn_cast(op.getType()); if (!result_ty || !result_ty.hasStaticShape()) return failure(); DenseIntElementsAttr sparse_begin_attr, sparse_end_attr; @@ -3750,7 +3744,7 @@ class ConvertStridedSliceGradOp return failure(); Value grad = op.getDy(); - Type element_type = grad.getType().cast().getElementType(); + Type element_type = mlir::cast(grad.getType()).getElementType(); // Perform reshape to undo any new/shrink axes done by strided slice. grad = rewriter.create( @@ -3830,7 +3824,7 @@ class ConvertRangeOp : public OpRewritePattern { PatternRewriter &rewriter) const override { auto result = op.getResult(); auto result_type = result.getType(); - if (!result_type.cast().hasStaticShape()) { + if (!mlir::cast(result_type).hasStaticShape()) { return failure(); } @@ -3863,7 +3857,7 @@ class ConvertDynamicRangeOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::RangeOp op, PatternRewriter &rewriter) const override { auto result = op.getResult(); - auto result_type = result.getType().cast(); + auto result_type = mlir::cast(result.getType()); if (result_type.hasStaticShape()) { return failure(); } @@ -3875,11 +3869,12 @@ class ConvertDynamicRangeOp : public OpRewritePattern { // To compute the length we need to use floating point calculations so that // ceil can be computed for the number of steps. auto compute_element_type = - getElementTypeOrSelf(start.getType()).isa() + mlir::isa(getElementTypeOrSelf(start.getType())) ? getElementTypeOrSelf(start.getType()) : rewriter.getF64Type(); auto compute_type = tensorflow::GetTypeFromTFTensorShape( - limit.getType().cast().getShape(), compute_element_type); + mlir::cast(limit.getType()).getShape(), + compute_element_type); // Compute the length of the sequence we are going to need. This includes // some conversion to float for the operations. @@ -3930,8 +3925,8 @@ class ConvertDynamicRangeOp : public OpRewritePattern { }; ElementsAttr ConvertAxisAttr(Value val, ElementsAttr attr, Builder *builder) { - auto int_attr = attr.cast(); - auto type = val.getType().cast(); + auto int_attr = mlir::cast(attr); + auto type = mlir::cast(val.getType()); SmallVector axis; axis.reserve(int_attr.getNumElements()); @@ -3954,7 +3949,7 @@ class ConvertLinSpaceOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::LinSpaceOp op, PatternRewriter &rewriter) const override { auto result = op.getResult(); - auto result_type = result.getType().dyn_cast(); + auto result_type = mlir::dyn_cast(result.getType()); if (!result_type || !result_type.hasStaticShape()) { return failure(); } @@ -4023,8 +4018,7 @@ class GenericConvertReductionOp : public OpRewritePattern { // TODO(b/141785544): Update this to not require ranked shapes. // Input shape needs to be ranked to convert negative indices in TensorFlow // to absolute indices required by HLO. - auto input_ty = - op.getInput().getType().template dyn_cast(); + auto input_ty = mlir::dyn_cast(op.getInput().getType()); if (!input_ty) return failure(); ArrayRef input_shape = input_ty.getShape(); @@ -4049,8 +4043,9 @@ class GenericConvertReductionOp : public OpRewritePattern { Type element_type = input_ty.getElementType(); // Only float, int, and complex types are currently supported. 
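The element-type check below migrates to the free-function form as well. Since llvm::isa (and therefore mlir::isa) is variadic, the three tests can equivalently be collapsed into one call; a sketch, with the type list taken from the comment above and an illustrative helper name:

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"

// Variadic isa<> returns true if the value is any of the listed types.
bool IsSupportedReductionElementType(mlir::Type element_type) {
  return mlir::isa<mlir::FloatType, mlir::IntegerType, mlir::ComplexType>(
      element_type);
}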
- if (!element_type.isa() && !element_type.isa() && - !element_type.isa()) { + if (!mlir::isa(element_type) && + !mlir::isa(element_type) && + !mlir::isa(element_type)) { return rewriter.notifyMatchFailure( op, "element type must be float, int, or complex type"); } @@ -4252,7 +4247,7 @@ class ConvertArgMinMaxOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { RankedTensorType input_type = - op.getInput().getType().template dyn_cast(); + mlir::dyn_cast(op.getInput().getType()); if (!input_type) { return failure(); } @@ -4267,7 +4262,7 @@ class ConvertArgMinMaxOp : public OpRewritePattern { Derived::GetInitialValue(input_element_type, loc, rewriter); RankedTensorType output_type = - op.getOutput().getType().template dyn_cast(); + mlir::dyn_cast(op.getOutput().getType()); if (!output_type) { return rewriter.notifyMatchFailure(op, "requires known rank"); } @@ -4364,12 +4359,11 @@ class ConvertTensorScatterOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - auto tensor_ty = - op.getTensor().getType().template dyn_cast(); + auto tensor_ty = mlir::dyn_cast(op.getTensor().getType()); auto indices_ty = - op.getIndices().getType().template dyn_cast(); + mlir::dyn_cast(op.getIndices().getType()); auto updates_ty = - op.getUpdates().getType().template dyn_cast(); + mlir::dyn_cast(op.getUpdates().getType()); if (!tensor_ty || !indices_ty || !updates_ty) return failure(); // Last dimension of the indices needs to known at compile time for @@ -4421,13 +4415,13 @@ class ConvertTensorScatterOp : public OpRewritePattern { updates = rewriter.create( op->getLoc(), broadcast_to_type, op.getUpdates(), const_op); - updates_ty = updates.getType().template dyn_cast(); + updates_ty = mlir::dyn_cast(updates.getType()); } int64_t tensor_rank = tensor_ty.getRank(); int64_t indices_rank = indices_ty.getRank(); int64_t updates_rank = - updates.getType().template dyn_cast().getRank(); + mlir::dyn_cast(updates.getType()).getRank(); int64_t window_dims = tensor_rank - num_index_dims; auto dims_attr = ScatterDimensionNumbersAttr::get( @@ -4558,7 +4552,7 @@ class ConvertTileOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::TileOp op, PatternRewriter &rewriter) const override { - auto input_ty = op.getInput().getType().dyn_cast(); + auto input_ty = mlir::dyn_cast(op.getInput().getType()); if (!input_ty || !input_ty.hasStaticShape()) return failure(); ArrayRef input_shape = input_ty.getShape(); Type element_type = input_ty.getElementType(); @@ -4639,7 +4633,7 @@ class ConvertTileOpDynamic : public OpRewritePattern { Location loc = op.getLoc(); Value input = op.getInput(); Value multiples = op.getMultiples(); - auto input_ty = input.getType().dyn_cast(); + auto input_ty = mlir::dyn_cast(input.getType()); if (!input_ty) return failure(); // TODO(disc): Remove this constraint once fold and canonicalization // implemented. 
@@ -4659,7 +4653,7 @@ class ConvertTileOpDynamic : public OpRewritePattern { } } - auto multiples_ty = multiples.getType().dyn_cast(); + auto multiples_ty = mlir::dyn_cast(multiples.getType()); int64_t multiples_rank = multiples_ty.getRank(); // rank of multiples input of tf.TileOp must be 1 if (multiples_rank != 1) return failure(); @@ -4728,16 +4722,14 @@ class ConvertMaxPoolGradOp : public OpRewritePattern { PatternRewriter &rewriter) const override { Location loc = op.getLoc(); - Type element_type = op.getOrigInput() - .getType() - .template cast() - .getElementType(); + Type element_type = + mlir::cast(op.getOrigInput().getType()).getElementType(); // Compute paddings using the original input and kernel shape and strides. // Here, ReduceWindow op as used as the MaxPool op is lowered to the // ReduceWindow op. auto input_ty = - op.getOrigInput().getType().template dyn_cast(); + mlir::dyn_cast(op.getOrigInput().getType()); if (!input_ty) return failure(); DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr( input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(), @@ -4798,9 +4790,8 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { return failure(); auto out_backprop_ty = - op.getOutBackprop().getType().template dyn_cast(); - auto filter_ty = - op.getFilter().getType().template dyn_cast(); + mlir::dyn_cast(op.getOutBackprop().getType()); + auto filter_ty = mlir::dyn_cast(op.getFilter().getType()); // With the exception of out_backprop's batch dimension, out_backprop and // filter need to have static shape. Filter is validated here, out_backprop @@ -4824,7 +4815,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { } else { auto pack = op.getInputSizes().template getDefiningOp(); if (!pack || pack.getAxis() != 0) return failure(); - auto pack_ty = pack.getType().template dyn_cast(); + auto pack_ty = mlir::dyn_cast(pack.getType()); if (!pack_ty || pack_ty.getRank() != 1) return failure(); for (auto i = 0; i < pack_ty.getDimSize(0); ++i) { if (i == batch_dim) { @@ -4862,7 +4853,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { explicit_paddings.reserve(explicit_paddings_attr.size()); for (Attribute explicit_padding : explicit_paddings_attr) explicit_paddings.push_back( - explicit_padding.cast().getInt()); + mlir::cast(explicit_padding).getInt()); } ArrayRef filter_shape = filter_ty.getShape(); @@ -5029,9 +5020,8 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { return failure(); auto out_backprop_ty = - op.getOutBackprop().getType().template dyn_cast(); - auto input_ty = - op.getInput().getType().template dyn_cast(); + mlir::dyn_cast(op.getOutBackprop().getType()); + auto input_ty = mlir::dyn_cast(op.getInput().getType()); for (RankedTensorType ty : {out_backprop_ty, input_ty}) if (!ty || !ty.hasStaticShape()) return failure(); @@ -5063,7 +5053,7 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { explicit_paddings.reserve(explicit_paddings_attr.size()); for (Attribute explicit_padding : explicit_paddings_attr) explicit_paddings.push_back( - explicit_padding.cast().getInt()); + mlir::cast(explicit_padding).getInt()); } constexpr int num_dims = num_spatial_dims + 2; @@ -5223,7 +5213,8 @@ class ConvertOneHotOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::OneHotOp op, PatternRewriter &rewriter) const override { - auto indices_ty = op.getIndices().getType().dyn_cast(); + auto indices_ty = + mlir::dyn_cast(op.getIndices().getType()); if (!indices_ty || !indices_ty.hasStaticShape()) 
return failure(); ArrayRef indices_shape = indices_ty.getShape(); Type element_type = indices_ty.getElementType(); @@ -5307,7 +5298,7 @@ class ConvertInfeedDequeueTupleOp result_types.reserve(op.getOutputs().size() + 1); for (const auto &output : op.getOutputs()) { Type ty = output.getType(); - if (auto tensor_ty = ty.dyn_cast()) { + if (auto tensor_ty = mlir::dyn_cast(ty)) { if (!tensor_ty.hasStaticShape()) return failure(); } result_types.push_back(ty); @@ -5412,7 +5403,7 @@ class ConvertTopKV2Op : public OpRewritePattern { if (!matchPattern(op.getK(), m_Constant(&k_attr))) return failure(); int64_t k = (*k_attr.begin()).getSExtValue(); - TensorType input_type = op.getInput().getType().cast(); + TensorType input_type = mlir::cast(op.getInput().getType()); if (!input_type.hasRank()) return failure(); int64_t input_rank = input_type.getRank(); int64_t last_dim_index = input_rank - 1; @@ -5436,7 +5427,7 @@ class ConvertUnpackOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::UnpackOp op, PatternRewriter &rewriter) const override { - auto value_type = op.getValue().getType().dyn_cast(); + auto value_type = mlir::dyn_cast(op.getValue().getType()); if (!value_type) return failure(); int64_t value_rank = value_type.getRank(); @@ -5482,7 +5473,7 @@ class ConvertUnpackOpDynamic : public OpRewritePattern { LogicalResult matchAndRewrite(TF::UnpackOp op, PatternRewriter &rewriter) const override { - auto value_type = op.getValue().getType().dyn_cast(); + auto value_type = mlir::dyn_cast(op.getValue().getType()); if (!value_type) return failure(); // TODO(disc): Remove this constraint once fold and canonicalization // implemented. @@ -5585,8 +5576,8 @@ class ConvertSigmoidGradOpDynamic : public OpRewritePattern { Location loc = op.getLoc(); Value y = op.getY(); Value dy = op.getDy(); - auto tp_y = y.getType().dyn_cast(); - auto tp_dy = dy.getType().dyn_cast(); + auto tp_y = mlir::dyn_cast(y.getType()); + auto tp_dy = mlir::dyn_cast(dy.getType()); if (!tp_y || !tp_dy) return failure(); // TODO(disc): Remove this constraint once fold and canonicalization @@ -5598,7 +5589,7 @@ class ConvertSigmoidGradOpDynamic : public OpRewritePattern { if (elem_tp.isSignlessInteger()) { attr = rewriter.getIntegerAttr(elem_tp, 1); } else { - assert(elem_tp.isa()); + assert(mlir::isa(elem_tp)); attr = rewriter.getFloatAttr(elem_tp, 1); } Value one = rewriter.create( @@ -5640,13 +5631,12 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - auto data_type = - op.getData().getType().template dyn_cast(); + auto data_type = mlir::dyn_cast(op.getData().getType()); if (!data_type) return failure(); int64_t data_rank = data_type.getRank(); auto segment_ids_type = - op.getSegmentIds().getType().template dyn_cast(); + mlir::dyn_cast(op.getSegmentIds().getType()); if (!segment_ids_type) return failure(); int64_t segment_ids_rank = segment_ids_type.getRank(); @@ -5766,7 +5756,7 @@ class ConvertRandomShuffleOp : public OpRewritePattern { return success(); }; - auto input_type = op.getValue().getType().dyn_cast(); + auto input_type = mlir::dyn_cast(op.getValue().getType()); if (!input_type) return failure(); if (input_type.hasStaticShape() && input_type.getNumElements() <= 1) // No shuffling is required, so copy input directly to output. 
@@ -5966,16 +5956,16 @@ class ConvertInplaceUpdateOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::InplaceUpdateOp op, PatternRewriter &rewriter) const override { - auto input = op.getX().dyn_cast>(); + auto input = mlir::dyn_cast>(op.getX()); if (!input) return failure(); auto indices = op.getI(); auto updates = op.getV(); // Slice each row of `i` and `v` to perform a separate dynamic-update-slice // on the contents of `x`. - auto input_type = input.getType().cast(); - auto updates_type = updates.getType().cast(); - auto indices_type = indices.getType().cast(); + auto input_type = mlir::cast(input.getType()); + auto updates_type = mlir::cast(updates.getType()); + auto indices_type = mlir::cast(indices.getType()); if (!input_type.hasRank()) return failure(); if (!updates_type.hasRank() || updates_type.isDynamicDim(0)) return failure(); @@ -6033,7 +6023,8 @@ class ConvertXlaDynamicUpdateSliceOp LogicalResult matchAndRewrite(TF::XlaDynamicUpdateSliceOp op, PatternRewriter &rewriter) const override { - auto indices_type = op.getIndices().getType().dyn_cast(); + auto indices_type = + mlir::dyn_cast(op.getIndices().getType()); if (!indices_type || !indices_type.hasStaticShape() || indices_type.getShape().size() != 1) return failure(); @@ -6062,8 +6053,8 @@ class ConvertXlaReduceScatterOp if (!matchPattern(op.getGroupAssignment(), m_Constant(&group_assignment))) return failure(); auto replica_groups = - hlo::convertElementsAttr(group_assignment, rewriter.getIntegerType(64)) - .cast(); + mlir::cast(hlo::convertElementsAttr( + group_assignment, rewriter.getIntegerType(64))); if (replica_groups.getType().getRank() != 2) return failure(); APInt scatter_dimension; @@ -6141,16 +6132,16 @@ class ConvertXlaReduceWindowOp // Create the mhlo.SelectAndScatter op. auto reduce_window_op = rewriter.create( loc, result_types, op.getInput(), op.getInitValue(), - hlo::convertElementsAttr(window_dimensions, rewriter.getIntegerType(64)) - .cast(), - hlo::convertElementsAttr(window_strides, rewriter.getIntegerType(64)) - .cast(), - hlo::convertElementsAttr(base_dilations, rewriter.getIntegerType(64)) - .cast(), - hlo::convertElementsAttr(window_dilations, rewriter.getIntegerType(64)) - .cast(), - hlo::convertElementsAttr(padding, rewriter.getIntegerType(64)) - .cast()); + mlir::cast(hlo::convertElementsAttr( + window_dimensions, rewriter.getIntegerType(64))), + mlir::cast(hlo::convertElementsAttr( + window_strides, rewriter.getIntegerType(64))), + mlir::cast(hlo::convertElementsAttr( + base_dilations, rewriter.getIntegerType(64))), + mlir::cast(hlo::convertElementsAttr( + window_dilations, rewriter.getIntegerType(64))), + mlir::cast( + hlo::convertElementsAttr(padding, rewriter.getIntegerType(64)))); // Insert a call to the reducer in the region of the mhlo op. 
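The XlaReduceWindow hunk above shows another recurring pattern in this patch: TF ops carry window dimensions, strides, dilations, and padding as integer elements attributes, while the mhlo builders require i64 DenseIntElementsAttr, so each attribute is widened and then re-cast. A sketch of that step, assuming hlo::convertElementsAttr behaves as the surrounding call sites use it (the helper name is illustrative, and the mhlo utils header is whatever this file already includes):

#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Support/LLVM.h"

// Widen an integer elements attribute to i64 and recover the concrete
// DenseIntElementsAttr type expected by the mhlo op builders.
// hlo::convertElementsAttr is assumed available from the mhlo utils header
// used by the surrounding file.
mlir::DenseIntElementsAttr WidenToI64(mlir::ElementsAttr attr,
                                      mlir::Builder &builder) {
  return mlir::cast<mlir::DenseIntElementsAttr>(
      mlir::hlo::convertElementsAttr(attr, builder.getIntegerType(64)));
}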
mlir::SymbolRefAttr func = op.getComputation(); auto func_op = cast(SymbolTable::lookupSymbolIn( @@ -6177,9 +6168,9 @@ class ConvertClipByValueOp : public OpRewritePattern { Value min = op.getClipValueMin(); Value max = op.getClipValueMax(); - auto input_ty = input.getType().cast(); - auto min_ty = min.getType().cast(); - auto max_ty = max.getType().cast(); + auto input_ty = mlir::cast(input.getType()); + auto min_ty = mlir::cast(min.getType()); + auto max_ty = mlir::cast(max.getType()); if (!input_ty.hasRank() || !min_ty.hasRank() || !max_ty.hasRank()) { return failure(); @@ -6215,8 +6206,9 @@ class ConvertConstOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::ConstOp op, PatternRewriter &rewriter) const override { // Convert only for valid HLO tensors. - auto ty = op.getType().dyn_cast(); - if (!ty || !ty.getElementType().isa()) + auto ty = mlir::dyn_cast(op.getType()); + if (!ty || + !mlir::isa(ty.getElementType())) return failure(); Location loc = op.getLoc(); @@ -6239,9 +6231,9 @@ class ConvertCumOp : public OpRewritePattern { LogicalResult matchAndRewrite(OpT op, PatternRewriter &rewriter) const override { - auto input = op.getX().template dyn_cast>(); + auto input = mlir::dyn_cast>(op.getX()); if (!input) return failure(); - auto input_type = input.getType().template dyn_cast(); + auto input_type = mlir::dyn_cast(input.getType()); if (!input_type || !input_type.hasStaticShape()) { return failure(); } @@ -6352,7 +6344,7 @@ class ConvertShapeOp : public OpRewritePattern { PatternRewriter &rewriter) const override { Value input = op.getInput(); - auto result_ty = op.getResult().getType().dyn_cast(); + auto result_ty = mlir::dyn_cast(op.getResult().getType()); if (!result_ty) { return failure(); } @@ -6373,8 +6365,8 @@ class ConvertDynamicExpandDimsOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::ExpandDimsOp op, PatternRewriter &rewriter) const override { auto input = op.getInput(); - auto input_ty = input.getType().cast(); - auto result_ty = op.getType().cast(); + auto input_ty = mlir::cast(input.getType()); + auto result_ty = mlir::cast(op.getType()); if (!result_ty.hasRank() || !input_ty.hasRank() || result_ty.hasStaticShape()) { return failure(); @@ -6431,8 +6423,8 @@ class ConvertDynamicSqueezeOp : public OpRewritePattern { LogicalResult matchAndRewrite(TF::SqueezeOp op, PatternRewriter &rewriter) const override { auto input = op.getInput(); - auto input_ty = input.getType().cast(); - auto result_ty = op.getType().cast(); + auto input_ty = mlir::cast(input.getType()); + auto result_ty = mlir::cast(op.getType()); if (!result_ty.hasRank() || !input_ty.hasRank() || result_ty.hasStaticShape()) { return failure(); @@ -6465,9 +6457,7 @@ class ConvertDynamicSqueezeOp : public OpRewritePattern { auto from_extents = rewriter.create(op.getLoc(), dims); - // chlo::DynamicReshapeOp checks if the reshape is legal and will fail if - // any non-1 dimension is squeezed. 
- rewriter.replaceOpWithNewOp(op, result_ty, input, + rewriter.replaceOpWithNewOp(op, result_ty, input, from_extents); return success(); } @@ -6492,24 +6482,23 @@ class ConvertXlaConvV2Op : public OpRewritePattern { return failure(); auto window_strides_named_attr = rewriter.getNamedAttr( - "window_strides", hlo::convertElementsAttr(window_strides_attr, - rewriter.getIntegerType(64)) - .cast()); + "window_strides", + mlir::cast(hlo::convertElementsAttr( + window_strides_attr, rewriter.getIntegerType(64)))); auto padding_named_attr = rewriter.getNamedAttr( - "padding", - hlo::convertElementsAttr(padding_attr, rewriter.getIntegerType(64)) - .cast()); + "padding", mlir::cast(hlo::convertElementsAttr( + padding_attr, rewriter.getIntegerType(64)))); auto lhs_dilation_named_attr = rewriter.getNamedAttr( "lhs_dilation", - hlo::convertElementsAttr(lhs_dilation_attr, rewriter.getIntegerType(64)) - .cast()); + mlir::cast(hlo::convertElementsAttr( + lhs_dilation_attr, rewriter.getIntegerType(64)))); auto rhs_dilation_named_attr = rewriter.getNamedAttr( "rhs_dilation", - hlo::convertElementsAttr(rhs_dilation_attr, rewriter.getIntegerType(64)) - .cast()); + mlir::cast(hlo::convertElementsAttr( + rhs_dilation_attr, rewriter.getIntegerType(64)))); int64_t feature_group_count_val = feature_group_count_attr.getValues()[0].getInt(); @@ -6566,12 +6555,12 @@ class ConvertXlaSelectAndScatterOp // Create the mhlo.SelectAndScatter op. auto select_and_scatter_op = rewriter.create( loc, result_types, op.getOperand(), op.getSource(), op.getInitValue(), - hlo::convertElementsAttr(window_dimensions, rewriter.getIntegerType(64)) - .cast(), - hlo::convertElementsAttr(window_strides, rewriter.getIntegerType(64)) - .cast(), - hlo::convertElementsAttr(padding, rewriter.getIntegerType(64)) - .cast()); + mlir::cast(hlo::convertElementsAttr( + window_dimensions, rewriter.getIntegerType(64))), + mlir::cast(hlo::convertElementsAttr( + window_strides, rewriter.getIntegerType(64))), + mlir::cast( + hlo::convertElementsAttr(padding, rewriter.getIntegerType(64)))); auto insert_call_to = [&](const mlir::SymbolRefAttr &func, Region *region) { auto func_op = cast(SymbolTable::lookupSymbolIn( @@ -6671,7 +6660,7 @@ class ConvertXlaVariadicReduceV2Op auto func_ty = func_op.getFunctionType(); SmallVector elementTypes{llvm::map_range( func_ty.getResults(), - [](Type ty) { return ty.cast().getElementType(); })}; + [](Type ty) { return mlir::cast(ty).getElementType(); })}; // Create the mhlo.reduce op. auto reduce_op = rewriter.create( @@ -6754,7 +6743,7 @@ class LowerYieldOp : public OpConversionPattern { // Returns a new tensor type from the given type with element type updated to // the given type. 
TensorType UpdateElementTypeTo(Type ty, Type element_ty) { - auto ranked_ty = ty.dyn_cast(); + auto ranked_ty = mlir::dyn_cast(ty); if (!ranked_ty) { return UnrankedTensorType::get(element_ty); } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc index 54bd5812644488..34df8fc9759a5c 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc @@ -113,9 +113,8 @@ LogicalResult ConvertReplicaGroups(OpBuilder& builder, if (!matchPattern(group_assignment_value, m_Constant(&group_assignment))) { return op->emitOpError() << "expects constant group_assignment"; } - replica_groups = - hlo::convertElementsAttr(group_assignment, builder.getIntegerType(64)) - .cast(); + replica_groups = mlir::cast( + hlo::convertElementsAttr(group_assignment, builder.getIntegerType(64))); if (replica_groups.getType().getRank() != 2) { return op->emitOpError() << "group_assignment should have rank 2, got " << replica_groups.getType().getRank(); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc index 3e8dd5b58ed2f1..68c412f79ff393 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc @@ -458,7 +458,7 @@ SmallVector GetValueWithToken( return new_result; }; - auto tuple_type = value.getType().dyn_cast(); + auto tuple_type = mlir::dyn_cast(value.getType()); // `value` is not a tuple, create a new tuple. if (!tuple_type) return {create_tuple({value, token})}; @@ -499,7 +499,7 @@ SmallVector GetTypeWithToken(OpBuilder& builder, ArrayRef types, } auto type = types[0]; - if (auto tuple_type = type.dyn_cast()) { + if (auto tuple_type = mlir::dyn_cast(type)) { auto result_types = llvm::to_vector(tuple_type.getTypes()); result_types.push_back(token_type); return {builder.getTupleType(result_types)}; @@ -536,7 +536,7 @@ void ReplaceWithTupleResult(OpBuilder& builder, ValueRange values, auto value = values[0]; auto replacement = replacements[0]; - auto tuple_type = value.getType().dyn_cast(); + auto tuple_type = mlir::dyn_cast(value.getType()); if (!tuple_type) { if (!value.use_empty()) { auto new_element = builder.create(replacement.getLoc(), diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td index 588639e3435aae..401d1e8b954e40 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td @@ -627,10 +627,6 @@ foreach TfOp = [TF_ExpandDimsOp, TF_ReshapeOp, TF_SqueezeOp, ] in { (addBenefit 2)>; } -// Lowering tf.Reshape with dynamic shape -def : Pat<(TF_ReshapeOp:$res MHLO_Tensor:$arg, $shape), - (CHLO_DynamicReshapeOp $arg, $shape)>; - // Returns NaN if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0. 
def : Pat<(TF_SignOp $x), (MHLO_SignOp $x)>; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc index d5560f2481b00f..ce8b46708d2f52 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc @@ -35,6 +35,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -75,13 +76,13 @@ namespace { // Returns true if the given type is a ranked tensor type with static or bounded // dimensions. bool IsBounded(Type ty) { - auto ranked_ty = ty.dyn_cast(); + auto ranked_ty = mlir::dyn_cast(ty); if (!ranked_ty) return false; if (ranked_ty.hasStaticShape()) return true; auto encoding = - ranked_ty.getEncoding().dyn_cast_or_null(); + mlir::dyn_cast_or_null(ranked_ty.getEncoding()); if (!encoding) return false; for (int i = 0; i < ranked_ty.getRank(); ++i) { @@ -96,10 +97,11 @@ bool IsBounded(Type ty) { bool HasSymbolRefAttr(Operation* op) { for (const auto& attr : op->getAttrs()) { Attribute attr_value = attr.getValue(); - if (attr_value.isa()) { + if (mlir::isa(attr_value)) { return true; - } else if (auto array_attr = attr_value.dyn_cast()) { - if (!array_attr.empty() && array_attr.begin()->isa()) { + } else if (auto array_attr = mlir::dyn_cast(attr_value)) { + if (!array_attr.empty() && + mlir::isa(*array_attr.begin())) { return true; } } @@ -146,8 +148,8 @@ class Tf2XlaRewritePattern : public ConversionPattern { }; bool ShouldRefineTypeTo(Type original_ty, Type updated_ty) { - auto updated = updated_ty.dyn_cast(); - auto original = original_ty.dyn_cast(); + auto updated = mlir::dyn_cast(updated_ty); + auto original = mlir::dyn_cast(original_ty); // Both types must be shaped types. if (!original || !updated) return false; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc index 142bd2b379208f..e43bcdf6d3a26e 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.cc @@ -35,7 +35,7 @@ using ::mlir::ModuleOp; using ::mlir::OwningOpRef; using ::tsl::StatusOr; -StatusOr> GetMlirModuleFromString( +absl::StatusOr> GetMlirModuleFromString( absl::string_view module_string, MLIRContext* context) { DialectRegistry mlir_registry; RegisterCommonToolingDialects(mlir_registry); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h index 9a6aeb44a27279..13baaba06aadb9 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/test_utils.h @@ -28,7 +28,7 @@ namespace test { // Given a raw string, return a ModuleOp that can be used with the given // MLIRContext. 
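The `HasSymbolRefAttr` hunk above checks both direct attributes and array attributes; its elided template arguments are evidently `SymbolRefAttr` and `ArrayAttr`. A self-contained sketch of that predicate:

```cpp
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LLVM.h"

// True if any attribute on `op` is a SymbolRefAttr, or an ArrayAttr whose
// first element is one.
static bool HasSymbolRefAttrSketch(mlir::Operation* op) {
  for (const mlir::NamedAttribute& attr : op->getAttrs()) {
    mlir::Attribute value = attr.getValue();
    if (mlir::isa<mlir::SymbolRefAttr>(value)) return true;
    if (auto array = mlir::dyn_cast<mlir::ArrayAttr>(value)) {
      if (!array.empty() && mlir::isa<mlir::SymbolRefAttr>(*array.begin()))
        return true;
    }
  }
  return false;
}
```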
-tsl::StatusOr> GetMlirModuleFromString( +absl::StatusOr> GetMlirModuleFromString( absl::string_view module_string, MLIRContext* mlir_context); } // namespace test diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc index b17d474f85a652..2709f9dada21a7 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc @@ -132,7 +132,7 @@ Tf2XlaRewriter::~Tf2XlaRewriter() { if (context_) context_->Unref(); } -tsl::StatusOr Tf2XlaRewriter::ImportXlaComputation( +absl::StatusOr Tf2XlaRewriter::ImportXlaComputation( XlaComputation& computation) { xla::DebugOptions debug_options; TF_ASSIGN_OR_RETURN(auto hlo_module_config, @@ -205,7 +205,7 @@ LogicalResult Tf2XlaRewriter::PrepareParams() { // concurrently running each of the MLIR functions create a new device. step_container_ = std::make_unique( /*step_id=*/0, cleanup); - tsl::Status status = step_container_->Create( + absl::Status status = step_container_->Create( device_->resource_manager(), tensorflow::XlaContext::kXlaContextResourceName, context_); if (!status.ok()) { @@ -214,7 +214,7 @@ LogicalResult Tf2XlaRewriter::PrepareParams() { } params_.step_container = step_container_.get(); - tsl::StatusOr version_or = tensorflow::GetTfGraphProducerVersion( + absl::StatusOr version_or = tensorflow::GetTfGraphProducerVersion( op_->getParentOfType()); if (!version_or.ok()) { return emitError(op_->getLoc()) << version_or.status().ToString(); @@ -232,13 +232,13 @@ LogicalResult Tf2XlaRewriter::PrepareParams() { // Returns true if the given type is a ranked tensor type with static or // bounded dimensions. bool IsBounded(Type ty) { - auto ranked_ty = ty.dyn_cast(); + auto ranked_ty = mlir::dyn_cast(ty); if (!ranked_ty) return false; if (ranked_ty.hasStaticShape()) return true; auto encoding = - ranked_ty.getEncoding().dyn_cast_or_null(); + mlir::dyn_cast_or_null(ranked_ty.getEncoding()); if (!encoding) return false; for (int i = 0; i < ranked_ty.getRank(); ++i) { @@ -253,10 +253,11 @@ bool IsBounded(Type ty) { bool HasSymbolRefAttr(Operation* op) { for (const auto& attr : op->getAttrs()) { Attribute attr_value = attr.getValue(); - if (attr_value.isa()) { + if (mlir::isa(attr_value)) { return true; - } else if (auto array_attr = attr_value.dyn_cast()) { - if (!array_attr.empty() && array_attr.begin()->isa()) { + } else if (auto array_attr = mlir::dyn_cast(attr_value)) { + if (!array_attr.empty() && + mlir::isa(*array_attr.begin())) { return true; } } @@ -305,7 +306,7 @@ LogicalResult Tf2XlaRewriter::PrepareKernelInputs( LogicalResult Tf2XlaRewriter::LegalizeOp() { for (Type ty : op_->getOperandTypes()) { - auto ranked_ty = ty.dyn_cast(); + auto ranked_ty = mlir::dyn_cast(ty); // Only bounded operands are supported in the XLA builders. 
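The `IsBounded` helpers duplicated in `legalize_tf_with_tf2xla.cc` and `tf2xla_rewriter.cc` accept a ranked tensor whose dynamic dimensions all carry bounds in the type encoding. A sketch, assuming `mhlo::TypeExtensionsAttr` for the encoding attribute and `ShapedType::kDynamic` for the missing-bound sentinel (both elided above):

```cpp
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"
#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"

// A type is "bounded" when it is ranked and every dynamic dimension has a
// bound recorded in the encoding attribute.
static bool IsBoundedSketch(mlir::Type ty) {
  auto ranked = mlir::dyn_cast<mlir::RankedTensorType>(ty);
  if (!ranked) return false;
  if (ranked.hasStaticShape()) return true;

  auto encoding = mlir::dyn_cast_or_null<mlir::mhlo::TypeExtensionsAttr>(
      ranked.getEncoding());
  if (!encoding) return false;

  for (int i = 0; i < ranked.getRank(); ++i) {
    if (ranked.isDynamicDim(i) &&
        encoding.getBounds()[i] == mlir::ShapedType::kDynamic) {
      return false;
    }
  }
  return true;
}
```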
if (!IsBounded(ranked_ty)) { return op_->emitRemark() @@ -328,7 +329,7 @@ LogicalResult Tf2XlaRewriter::LegalizeOp() { if (failed(PrepareParams())) return failure(); std::shared_ptr props; - tsl::Status status = tensorflow::NodeProperties::CreateFromNodeDef( + absl::Status status = tensorflow::NodeProperties::CreateFromNodeDef( *nodedef_or.value(), params_.function_library->GetFunctionLibraryDefinition(), &props); if (!status.ok()) { @@ -387,11 +388,11 @@ LogicalResult Tf2XlaRewriter::LegalizeOp() { if (failed(VerifyOpResults(op_context))) return failure(); - StatusOr tuple_result_or_status = - CompileWithHloImporter(op_context); - if (!tuple_result_or_status.ok()) { - return op_->emitRemark() << tuple_result_or_status.status().ToString(); - } + absl::StatusOr tuple_result_or_status = + CompileWithHloImporter(op_context); + if (!tuple_result_or_status.ok()) { + return op_->emitRemark() << tuple_result_or_status.status().ToString(); + } mhlo::TupleOp tuple_result = tuple_result_or_status.value(); llvm::SmallVector output_values; @@ -403,7 +404,7 @@ LogicalResult Tf2XlaRewriter::LegalizeOp() { return success(); } -tsl::StatusOr Tf2XlaRewriter::CompileWithHloImporter( +absl::StatusOr Tf2XlaRewriter::CompileWithHloImporter( tensorflow::OpKernelContext& op_context) { // XLA can only return a single value. Wrap all output op return values // in a Tuple op that gets unpacked later. diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h index 71cafc5579ff16..2b8c52750a6c44 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.h @@ -56,12 +56,12 @@ class Tf2XlaRewriter { // Compiles the given Operation with XlaBuilder and imports the generated HLO // via the HLO -> MHLO importer. - tsl::StatusOr CompileWithHloImporter( + absl::StatusOr CompileWithHloImporter( tensorflow::OpKernelContext& op_context); // Import the given XlaComputation into the parent module. Returns the given // generated function. - tsl::StatusOr ImportXlaComputation( + absl::StatusOr ImportXlaComputation( xla::XlaComputation& computation); // Prepares OpKernelContext params common to all the ops. 
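`tsl::Status` and `tsl::StatusOr` are aliases of the `absl` types, so the renames in these hunks are behavior-preserving spelling changes. A minimal usage sketch of the target API:

```cpp
#include "absl/status/status.h"
#include "absl/status/statusor.h"

// Returns the value when positive, an error status otherwise.
absl::StatusOr<int> CheckPositive(int v) {
  if (v <= 0) return absl::InvalidArgumentError("expected a positive value");
  return v;
}
```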
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc index 061889965aebd9..aecf9db3f0d5fe 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc @@ -127,7 +127,7 @@ class Tf2XlaRewriterTest : public ::testing::Test { module_, test::GetMlirModuleFromString(module_string, &context_)); context_.loadAllAvailableDialects(); - return tsl::OkStatus(); + return absl::OkStatus(); } Status LegalizeSingleOp(Operation& op) { @@ -143,7 +143,7 @@ class Tf2XlaRewriterTest : public ::testing::Test { return tsl::errors::Internal("Failed to rewrite op"); } - return tsl::OkStatus(); + return absl::OkStatus(); } Status LegalizeModule(std::string module_string = kMlirModuleStr) { @@ -170,7 +170,7 @@ class Tf2XlaRewriterTest : public ::testing::Test { return tsl::errors::Internal("Could not legalize all ops"); } - return tsl::OkStatus(); + return absl::OkStatus(); } mlir::func::FuncOp GetMainFunc() { diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc index 7938fc4684ce2b..a6435081820880 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc @@ -27,6 +27,7 @@ limitations under the License. #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -89,18 +90,18 @@ static void IncrementCounterFor(tensorflow::monitoring::Counter<1>* counter, } bool HasBounds(RankedTensorType type) { - auto encoding = - type.getEncoding().dyn_cast_or_null(); + auto encoding = mlir::dyn_cast_or_null( + type.getEncoding()); return (encoding && !encoding.getBounds().empty()); } bool HasStaticShapeOrBounded(Value val) { auto type = val.getType(); - if (type.isa()) { + if (mlir::isa(type)) { return false; } - if (type.isa()) { - auto ranked_tensor = type.dyn_cast(); + if (mlir::isa(type)) { + auto ranked_tensor = mlir::dyn_cast(type); if (ranked_tensor.hasStaticShape()) { return true; } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc index 39eadcb93fcfce..4183d181fc5611 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_test.cc @@ -40,7 +40,7 @@ using ::mlir::OwningOpRef; using ::mlir::PassManager; using ::tensorflow::monitoring::testing::CellReader; -StatusOr> GetMlirModuleFromString( +absl::StatusOr> GetMlirModuleFromString( absl::string_view module_string, MLIRContext* context) { mlir::DialectRegistry mlir_registry; RegisterAllTensorFlowDialects(mlir_registry); diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 1ce45fe7345c11..4583fc9cd967e2 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -38,9 +38,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tosa/tfl_passes.h" #include "tensorflow/compiler/mlir/tosa/transforms/passes.h" #include "xla/mlir/framework/transforms/passes.h" -#include "xla/mlir_hlo/lhlo/transforms/passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" -#include "xla/service/cpu/hlo_xla_runtime_pipeline.h" int main(int argc, char **argv) { tensorflow::InitMlir y(&argc, &argv); @@ -52,7 +50,6 @@ int main(int argc, char **argv) { mlir::tf_saved_model::registerTensorFlowSavedModelPasses(); mlir::TFL::registerTensorFlowLitePasses(); mlir::mhlo::registerAllMhloPasses(); - mlir::lmhlo::registerAllLmhloPasses(); // These are in compiler/mlir/tf2xla and not part of the above MHLO passes. mlir::mhlo::registerLegalizeTfPasses(); diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc index 1e5114c1103c1f..906f828f2d5023 100644 --- a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc @@ -57,7 +57,7 @@ namespace tfr { const char* const kTFRLibEnv = "TF_MLIR_TFR_LIB_DIR"; -StatusOr> TFRDecomposeContext::Get( +absl::StatusOr> TFRDecomposeContext::Get( mlir::MLIRContext* mlir_ctx) { Env* env = Env::Default(); std::string tfr_lib_dir; @@ -121,8 +121,8 @@ std::unique_ptr TFRDecomposeContext::GetFromText( return std::make_unique(module_op); } -StatusOr TFRDecomposeContext::ExpandNode(const NodeDef& node_def, - StringPiece func_name) { +absl::StatusOr TFRDecomposeContext::ExpandNode( + const NodeDef& node_def, StringPiece func_name) { const OpDef* op_def; TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node_def.op(), &op_def)); DataTypeVector input_dtys, output_dtys; @@ -209,8 +209,8 @@ TFRDecomposeContext::TFRDecomposeContext(mlir::ModuleOp tfr_module) void TFRDecomposeContext::Destroy() { tfr_module_.erase(); } -StatusOr ExpandNode(const NodeDef& node_def, - StringPiece func_name) { +absl::StatusOr ExpandNode(const NodeDef& node_def, + StringPiece func_name) { mlir::MLIRContext mlir_ctx; TF_ASSIGN_OR_RETURN(auto ctx, TFRDecomposeContext::Get(&mlir_ctx)); return ctx->ExpandNode(node_def, func_name); diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td index 0e036caf4d2c77..b3bd4d618bd808 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td @@ -41,7 +41,6 @@ def TFR_Dialect : Dialect { }]; let cppNamespace = "::mlir::TFR"; - let usePropertiesForAttributes = 0; } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tfr/passes/decompose.cc b/tensorflow/compiler/mlir/tfr/passes/decompose.cc index 5d59d958d3e7c9..988dc9e612b9c3 100644 --- a/tensorflow/compiler/mlir/tfr/passes/decompose.cc +++ b/tensorflow/compiler/mlir/tfr/passes/decompose.cc @@ -84,8 +84,8 @@ namespace { // Quantize the float value based on given scale and zero point attributes. IntegerAttr Quantize(float value, Attribute scale_attr, Attribute zp_attr, OpBuilder builder) { - double scale = scale_attr.cast().getValueAsDouble(); - int64_t zp = zp_attr.cast().getInt(); + double scale = mlir::cast(scale_attr).getValueAsDouble(); + int64_t zp = mlir::cast(zp_attr).getInt(); int quantized = static_cast(std::round(value / scale) + zp); quantized = @@ -187,11 +187,12 @@ LogicalResult DecomposeTFOpsPass::RewriteUnregisteredTFOps() { // default value in the argument attribute. 
llvm::SmallVector new_operands; for (auto arg : llvm::enumerate(compose_func_type.getInputs())) { - if (auto tensor_type = arg.value().dyn_cast()) { + if (auto tensor_type = mlir::dyn_cast(arg.value())) { auto casted = builder.create(op->getLoc(), tensor_type, op->getOperand(arg.index())); new_operands.push_back(casted); - } else if (auto list_type = arg.value().dyn_cast()) { + } else if (auto list_type = + mlir::dyn_cast(arg.value())) { llvm::SmallVector variadic_operands; for (int i = arg.index(); i < op->getNumOperands(); i++) { auto casted = builder.create( @@ -211,8 +212,8 @@ LogicalResult DecomposeTFOpsPass::RewriteUnregisteredTFOps() { } if (!attribute && attr_name.getValue() == "out_type") { auto type = op->getResult(0).getType(); - if (type.isa()) { - type = type.cast().getElementType(); + if (mlir::isa(type)) { + type = mlir::cast(type).getElementType(); } attribute = TypeAttr::get(type); } @@ -220,8 +221,9 @@ LogicalResult DecomposeTFOpsPass::RewriteUnregisteredTFOps() { // Wrap these special attributes as a special TFR constant, so the SSA // value has a valid type to be used as TFR function argument. These // attributes are not expected to be manipulated by the lowering passes. - if (attribute.isa() || attribute.isa() || - attribute.isa() || attribute.isa()) { + if (mlir::isa(attribute) || mlir::isa(attribute) || + mlir::isa(attribute) || + mlir::isa(attribute)) { TFRAttrType output_type = TFRAttrType::get(builder.getContext()); attr_cst = builder.create(op->getLoc(), output_type, attribute); @@ -245,9 +247,10 @@ LogicalResult DecomposeTFOpsPass::RewriteUnregisteredTFOps() { // op result. llvm::SmallVector new_results; for (auto res : llvm::enumerate(compose_func_type.getResults())) { - if (res.value().dyn_cast()) { + if (mlir::dyn_cast(res.value())) { new_results.push_back(new_op.getResult(res.index())); - } else if (auto list_type = res.value().dyn_cast()) { + } else if (auto list_type = + mlir::dyn_cast(res.value())) { for (int i = res.index(), j = 0; i < op->getNumResults(); i++, j++) { auto index = builder.create( op->getLoc(), builder.getIndexAttr(j)); diff --git a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc index dd85565cfed88e..61aa404847ee07 100644 --- a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc +++ b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc @@ -136,7 +136,7 @@ class RewriteTFRCallOp : public OpRewritePattern { // by the frontend correctly. 
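The `out_type` fallback above derives the attribute from the op's first result; the elided cast argument is evidently `ShapedType`, given the `getElementType()` call. A sketch:

```cpp
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LLVM.h"

// Derive a TypeAttr for "out_type" from an op's first result, unwrapping
// shaped types to their element type.
static mlir::TypeAttr DeriveOutTypeAttr(mlir::Operation* op) {
  mlir::Type type = op->getResult(0).getType();
  if (auto shaped = mlir::dyn_cast<mlir::ShapedType>(type))
    type = shaped.getElementType();
  return mlir::TypeAttr::get(type);
}
```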
Value CastToNonDerivedType(PatternRewriter& rewriter, Location loc, CastOp cast_op, Type input_tfr_type) const { - auto tensor_type = input_tfr_type.dyn_cast(); + auto tensor_type = mlir::dyn_cast(input_tfr_type); if (!tensor_type) return cast_op.getArg(); auto attr_names = tensor_type.getAttrKeys(); @@ -150,7 +150,7 @@ class RewriteTFRCallOp : public OpRewritePattern { } Type original_input_type = - cast_op.getInputElementType().cast().getValue(); + mlir::cast(cast_op.getInputElementType()).getValue(); if (result_elt_type != original_input_type) { UnrankedTensorType result_type = UnrankedTensorType::get(result_elt_type); return rewriter.create(loc, result_type, cast_op.getArg()); @@ -166,10 +166,10 @@ class RewriteTFRCallOp : public OpRewritePattern { llvm::SmallVectorImpl& input_values) const { if (input_types.size() <= 1) return; - Type target_input_type = input_types[0].cast().getValue(); + Type target_input_type = mlir::cast(input_types[0]).getValue(); auto result_type = UnrankedTensorType::get(target_input_type); for (auto i = 1; i < input_types.size(); ++i) { - Type current_input_type = input_types[i].cast().getValue(); + Type current_input_type = mlir::cast(input_types[i]).getValue(); if (current_input_type != target_input_type) { input_values[i] = rewriter.create(loc, result_type, input_values[i]); @@ -189,7 +189,7 @@ LogicalResult RewriteTFRCallOp::AddDerivedAttrs( llvm::StringMap* derived_attrs) const { // If there is an attribute associated to the input in the signature, we // store it as an derived attribute. - if (auto tensor_type = input_tfr_type.dyn_cast()) { + if (auto tensor_type = mlir::dyn_cast(input_tfr_type)) { auto attr_names = tensor_type.getAttrKeys(); if (attr_names.empty()) return success(); @@ -201,7 +201,7 @@ LogicalResult RewriteTFRCallOp::AddDerivedAttrs( // If there is an attribute associated to the input in the signature, // we store it as an derived attribute. - if (auto list_type = input_tfr_type.dyn_cast()) { + if (auto list_type = mlir::dyn_cast(input_tfr_type)) { auto attr_names = list_type.getAttrKeys(); if (attr_names.empty()) return success(); @@ -314,7 +314,7 @@ Attribute RewriteTFRCallOp::ProcessAttributeValue(Attribute attr, if (!attr_type) return attr; if (attr_type.getValue() == "tensor") { - if (auto f = attr.dyn_cast()) { + if (auto f = mlir::dyn_cast(attr)) { RankedTensorType type = RankedTensorType::get({}, f.getType()); return DenseFPElementsAttr::get(type, attr); } @@ -332,13 +332,13 @@ LogicalResult RewriteTFRCallOp::DeriveOutputTypes( const llvm::StringMap& attrs, SmallVectorImpl* output_types) const { for (auto res : llvm::enumerate(signature.getResults())) { - if (auto tensor_type = res.value().dyn_cast()) { + if (auto tensor_type = mlir::dyn_cast(res.value())) { // tfr.tensor should only have one attribute attached. 
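`CastValuesToSameType` reads each input's element type from a `TypeAttr` and inserts a cast whenever it disagrees with the first input's. The created op's template argument is elided above, so an `UnrealizedConversionCastOp` stands in for the dialect's cast op in this sketch:

```cpp
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LLVM.h"

// Cast every input whose element type differs from the first input's to an
// unranked tensor of the common element type.
static void CastValuesToSameTypeSketch(
    mlir::PatternRewriter& rewriter, mlir::Location loc,
    llvm::ArrayRef<mlir::Attribute> input_types,
    llvm::SmallVectorImpl<mlir::Value>& input_values) {
  if (input_types.size() <= 1) return;
  mlir::Type target = mlir::cast<mlir::TypeAttr>(input_types[0]).getValue();
  auto result_type = mlir::UnrankedTensorType::get(target);
  for (size_t i = 1; i < input_types.size(); ++i) {
    mlir::Type current = mlir::cast<mlir::TypeAttr>(input_types[i]).getValue();
    if (current != target) {
      input_values[i] = rewriter
                            .create<mlir::UnrealizedConversionCastOp>(
                                loc, mlir::TypeRange{result_type},
                                mlir::ValueRange{input_values[i]})
                            .getResult(0);
    }
  }
}
```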
auto attr_key = tensor_type.getAttrKeys().front(); Builder builder(signature.getContext()); if (auto attr = attrs.lookup(attr_key.getValue())) { output_types->push_back( - UnrankedTensorType::get(attr.cast().getValue())); + UnrankedTensorType::get(mlir::cast(attr).getValue())); } else if (Type element_type = GetFixedElementType(attr_key.getValue(), builder)) { output_types->push_back(UnrankedTensorType::get(element_type)); @@ -350,16 +350,18 @@ LogicalResult RewriteTFRCallOp::DeriveOutputTypes( continue; } - if (auto list_type = res.value().dyn_cast()) { + if (auto list_type = mlir::dyn_cast(res.value())) { // There are two cases: N*T or list(dtype) auto attr_keys = list_type.getAttrKeys(); // N*T case if (attr_keys.size() == 2) { // The first one is N, and the second one is T int list_size = - attrs.lookup(attr_keys[0].getValue()).cast().getInt(); + mlir::cast(attrs.lookup(attr_keys[0].getValue())) + .getInt(); Type list_type = - attrs.lookup(attr_keys[1].getValue()).cast().getValue(); + mlir::cast(attrs.lookup(attr_keys[1].getValue())) + .getValue(); for (int i = 0; i < list_size; ++i) { output_types->push_back(UnrankedTensorType::get(list_type)); } @@ -398,11 +400,12 @@ LogicalResult RewriteTFRCallOp::CreateAndReplaceOp( SmallVector new_results; for (auto res : llvm::enumerate(call_op.getResultTypes())) { Type res_type = res.value(); - if (res_type.dyn_cast()) { + if (mlir::dyn_cast(res_type)) { Value new_res = new_op->getResult(res.index()); auto casted = rewriter.create(loc, res_type, new_res); new_results.push_back(casted.getOut()); - } else if (auto list_type = res.value().dyn_cast()) { + } else if (auto list_type = + mlir::dyn_cast(res.value())) { SmallVector tensor_list; for (int i = res.index(); i < new_op->getNumResults(); i++) { Value new_res = new_op->getResult(i); diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 31b6aa272faf1d..1b6dbbd9176d22 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -145,6 +145,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", "@tf_runtime//:basic_kernels_opdefs", @@ -166,6 +167,7 @@ cc_library( "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", "@tf_runtime//:basic_kernels_opdefs", @@ -206,6 +208,7 @@ cc_library( "transforms/deduplicate_if_result_pass.cc", "transforms/fuse_tpu_compile_and_execute_ops.cc", "transforms/insert_tensor_copy.cc", + "transforms/lower_bound_batch_threads.cc", "transforms/lower_saved_model.cc", "transforms/merge_tf_if_ops.cc", "transforms/optimize.cc", @@ -289,22 +292,21 @@ cc_library( deps = [ ":tf_to_tfrt", ":tfrt_compile_options", + ":tfrt_pipeline_options", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/core/platform:errors", "//tensorflow/core/platform:status", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", "@com_google_absl//absl/strings", - "@llvm-project//mlir:FuncDialect", 
"@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@tf_runtime//:bef", "@tf_runtime//:core_runtime", - "@tf_runtime//:hostcontext", "@tf_runtime//:mlirtobef", - "@tf_runtime//:tensor", ], ) @@ -331,6 +333,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@local_tsl//tsl/platform:errors", "@tf_runtime//:bef", "@tf_runtime//:core_runtime", @@ -347,7 +350,6 @@ cc_library( "translate/import_model.h", ], visibility = [ - # copybara:uncomment "//learning/brain/experimental/tfrt/visualization:__pkg__", "//tensorflow/compiler/mlir/tfrt/tests/saved_model:__pkg__", "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__pkg__", "//tensorflow/core/tfrt/graph_executor:__pkg__", @@ -406,10 +408,7 @@ cc_library( hdrs = ["translate/tfrt_compile_options.h"], compatible_with = get_compatible_with_portable(), # copybara: comment visibility = ["//visibility:public"], - deps = [ - "//tensorflow/core/protobuf:for_core_protos_cc", - "@com_google_absl//absl/strings", - ], + deps = ["//tensorflow/core/protobuf:for_core_protos_cc"], ) cc_library( @@ -426,6 +425,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@tf_runtime//:compiler_tfrt_op_interfaces", ], ) @@ -621,6 +621,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@tf_runtime//:core_runtime_opdefs", ], ) @@ -641,6 +642,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@tf_runtime//:basic_kernels_opdefs", "@tf_runtime//:core_runtime_opdefs", ], @@ -662,6 +664,7 @@ cc_library( hdrs = ["transforms/tpu_passes.h"], visibility = [":friends"] + if_google([ "//learning/brain/tfrt/ifrt/pjrt/__subpackages__", + "//learning/serving/servables/tfrt:__subpackages__", ]), deps = [ ":fallback_converter", diff --git a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc index 5573e7c2d46866..28f582723c8b2f 100644 --- a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc +++ b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tfrt/constants.h" #include "tensorflow/core/tfrt/fallback/cost_recorder.h" @@ -59,14 +60,14 @@ int64_t InferLookupTableFindV2Cost(const CostContext& context, constexpr int64_t kLookupTableFindCostScale = 8; constexpr int64_t kLookupTableFindStringKeyCostScale = 16; - auto value_type = op.getValues().getType().cast(); - auto key_type = op.getKeys().getType().cast(); + auto value_type = mlir::cast(op.getValues().getType()); + auto key_type = mlir::cast(op.getKeys().getType()); int64_t output_size = InferTensorSize(context, value_type); int64_t cost = kLookupTableFindCostScale * output_size; - if (key_type.getElementType().isa()) + if (mlir::isa(key_type.getElementType())) cost *= kLookupTableFindStringKeyCostScale; return cost; @@ -74,15 +75,15 @@ int64_t InferLookupTableFindV2Cost(const CostContext& context, // The cost function for tf.GatherV2. 
int64_t InferGatherV2Cost(const CostContext& context, mlir::TF::GatherV2Op op) { - return InferTensorSize(context, - op.getOutput().getType().cast()); + return InferTensorSize( + context, mlir::cast(op.getOutput().getType())); } // The cost function for tf.SparseSegmentSumOp. template int64_t InferSparseSegmentOpCost(const CostContext& context, OpType op) { return InferTensorSize( - context, op.getOutput().getType().template cast()); + context, mlir::cast(op.getOutput().getType())); } // CostFunctionRegistry is a map from op names to their cost functions. @@ -145,8 +146,8 @@ void CostAnalysis::AnalyzeArguments(mlir::func::FuncOp func_op) { // Use the max size among function inputs as the default size of dynamic // shaped tensors in the function. for (auto arg : func_op.getArguments()) { - if (!arg.getType().isa()) continue; - auto type = arg.getType().cast(); + if (!mlir::isa(arg.getType())) continue; + auto type = mlir::cast(arg.getType()); if (type.hasRank()) { max_arg_size_ = std::max(max_arg_size_, GetRankedTensorSize(type)); } @@ -204,7 +205,7 @@ void CostAnalysis::EvaluateCost(mlir::Operation* op) { // For other ops, use the sum of input sizes as its cost. int64_t cost = kDefaultCheapCost; for (auto operand : op->getOperands()) { - auto type = operand.getType().cast(); + auto type = mlir::cast(operand.getType()); if (type.hasRank()) { cost += GetRankedTensorSize(type); } else { diff --git a/tensorflow/compiler/mlir/tfrt/function/function.cc b/tensorflow/compiler/mlir/tfrt/function/function.cc index 42b7ff2b38982a..c29b5aeabda8ea 100644 --- a/tensorflow/compiler/mlir/tfrt/function/function.cc +++ b/tensorflow/compiler/mlir/tfrt/function/function.cc @@ -15,23 +15,20 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfrt/function/function.h" +#include "absl/log/log.h" #include "absl/strings/match.h" -#include "absl/strings/str_split.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tfrt/bef/bef_buffer.h" // from @tf_runtime #include "tfrt/bef_converter/mlir_to_bef.h" // from @tf_runtime -#include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime -#include "tfrt/core_runtime/op_handler.h" // from @tf_runtime -#include "tfrt/host_context/host_context.h" // from @tf_runtime -#include "tfrt/tensor/dense_host_tensor_view.h" // from @tf_runtime namespace tensorflow { @@ -93,7 +90,7 @@ Status CompileTFMLIRToBEF(const TfrtFunctionCompileOptions& options, return diag_handler.Combine( tensorflow::errors::Internal("failed to convert MLIR to BEF.")); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/ir/BUILD 
b/tensorflow/compiler/mlir/tfrt/ir/BUILD index 68e9624e118453..b29066807fbf78 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/BUILD +++ b/tensorflow/compiler/mlir/tfrt/ir/BUILD @@ -29,6 +29,7 @@ cc_library( ":tfrt_fallback_opdefs_inc_gen", "@llvm-project//mlir:IR", "@llvm-project//mlir:SideEffectInterfaces", + "@llvm-project//mlir:Support", ], ) @@ -43,7 +44,6 @@ cc_library( # copybara:uncomment "//learning/brain/experimental/tfrt:__subpackages__", # copybara:uncomment "//learning/brain/tfrt/tpu/compiler/mlir:__subpackages__", "//tensorflow/compiler/mlir/tfrt:__subpackages__", - "//tensorflow/core/runtime_fallback:__subpackages__", "//tensorflow/core/tfrt/saved_model:friends", ], deps = [ @@ -98,6 +98,7 @@ cc_library( deps = [ "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@tf_runtime//:basic_kernels_opdefs", ], ) diff --git a/tensorflow/compiler/mlir/tfrt/ir/gpu_ops.td b/tensorflow/compiler/mlir/tfrt/ir/gpu_ops.td index fce1756d11df31..e8ba2fc4a47ac1 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/gpu_ops.td +++ b/tensorflow/compiler/mlir/tfrt/ir/gpu_ops.td @@ -30,7 +30,6 @@ def TFRT_GPU_Dialect : Dialect { }]; let cppNamespace = "::tfrt::gpu"; - let usePropertiesForAttributes = 0; } class Gpu_Op traits = []> : diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD index bfc93b9252ccbf..374aad2a242d9b 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD @@ -14,7 +14,6 @@ td_library( includes = ["."], visibility = [ # copybara:uncomment "//learning/brain/tfrt/mlir:__subpackages__", - "//learning/infra/mira/distributed:__subpackages__", ], deps = [ "@llvm-project//mlir:OpBaseTdFiles", @@ -51,7 +50,6 @@ cc_library( ], visibility = [ # copybara:uncomment "//learning/brain/tfrt/mlir:__subpackages__", - "//learning/infra/mira/distributed:__subpackages__", "//tensorflow/compiler/mlir/tfrt:__subpackages__", ], deps = [ @@ -59,6 +57,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:InliningUtils", + "@llvm-project//mlir:Support", ], ) @@ -70,9 +69,6 @@ td_library( "tf_ops.td", ], includes = ["."], - visibility = [ - # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", - ], deps = [ ":mlrt_td_files", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", @@ -155,7 +151,6 @@ cc_library( hdrs = ["tf_mlrt_ops.h"], visibility = [ # copybara:uncomment "//learning/brain/experimental/tfrt/mlrt/application/tensorflow/tests:__subpackages__", - # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", "//tensorflow/compiler/mlir/tfrt:__subpackages__", ], deps = [ @@ -167,6 +162,7 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:InliningUtils", "@llvm-project//mlir:SideEffectInterfaces", + "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", "@tf_runtime//:compiler_tfrt_op_interfaces", "@tf_runtime//:compiler_tfrt_traits", diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.cc b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.cc index 50d4cb1214250b..b4e337f328b27b 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.cc +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "mlir/IR/Region.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h" @@ -73,17 +74,17 @@ mlir::Type MlrtDialect::parseType(mlir::DialectAsmParser &parser) const { // Print a type registered to this dialect. void MlrtDialect::printType(mlir::Type type, mlir::DialectAsmPrinter &os) const { - if (type.isa()) { + if (mlir::isa(type)) { os << "future"; return; } - if (type.isa()) { + if (mlir::isa(type)) { os << "promise"; return; } - if (type.isa()) { + if (mlir::isa(type)) { os << "async_handle"; return; } diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td index 94416661455a9c..b260dcb402f3f2 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td @@ -26,7 +26,6 @@ def Mlrt_Dialect : Dialect { }]; let cppNamespace = "::mlrt::compiler"; - let usePropertiesForAttributes = 0; } def MlrtFutureType : DialectType traits = []> : diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cc b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cc index fc4cb6a93a28ea..d6ddc8f96fd901 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cc +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cc @@ -19,6 +19,7 @@ limitations under the License. #include "mlir/IR/DialectImplementation.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" @@ -74,7 +75,7 @@ mlir::Type TensorflowMlrtDialect::parseType( // Print a type registered to this dialect. void TensorflowMlrtDialect::printType(mlir::Type type, mlir::DialectAsmPrinter &os) const { - if (type.isa()) { + if (mlir::isa(type)) { os << "tensor"; return; } diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td index fcbf2358b3b936..0659143f49b39b 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td @@ -449,27 +449,21 @@ def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", [Pure]> { let summary = "Loads a variable tensor as an IFRT array for mlrt"; let description = [{ - This is the MLRT version of tf.IfrtLoadVariableOp. + This op loads a restored variable tensor as a tensor future. It is a + replacement of `tf.ReadVariableOp`. - This op loads a variable tensor as an IFRT array and binds it with the specified name. + This op returns a scalar string tensor containing the restored variable name, which can be + used as a key within the runtime, as well as a future for the tensor. - This op is an replacement of `tf.ReadVariableOp` in the case that a constant - variable tensor is an input to the tpu program invoked by `tf.IfrtCall`. - - After a `tf.ReadVariableOp` is lowered into `tf.IfrtLoadVariableOp`, the `tf.IfrtCall` kernel - will bind the loaded IFRT array by name with the tpu program's input. 
- - `tf.IfrtLoadVariableOp` converts the tensor into an IFRT array based on device and sharding - configuration specified in `VariableDeviceShardingConfigProto`. - - This op returns a scalar string tensor as a key for user to look for the loaded array - and a future containing the restored tensor. + The `tf.IfrtCall` kernel uses the output $array_key. + Other ops executed by TFRT may make use of $tensor_future. }]; let arguments = (ins TFTensorType:$variable, StrAttr:$device_sharding_config_proto_text, - StrAttr:$name + StrAttr:$name, + DefaultValuedAttr:$used_by_host ); let results = (outs diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td index 0791423a91c17f..fa08ea5907ac81 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td @@ -172,7 +172,8 @@ def TFIfrtLoadVariableOp: TensorflowMlrt_Op<"tf_ifrt_load_variable", [Pure]> { let arguments = (ins TF_Tensor:$variable, StrAttr:$device_sharding_config_proto_text, - StrAttr:$name + StrAttr:$name, + DefaultValuedAttr:$used_by_host ); let results = (outs diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.cc b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.cc index 4bc8a6842bffe1..dd47e81ee1a6df 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.cc +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.cc @@ -18,6 +18,7 @@ limitations under the License. #include "mlir/IR/DialectImplementation.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace tfrt { namespace fallback { @@ -47,12 +48,12 @@ Type FallbackDialect::parseType(DialectAsmParser &parser) const { /// Print a type registered to this dialect. void FallbackDialect::printType(Type type, DialectAsmPrinter &os) const { - if (type.isa()) { + if (mlir::isa(type)) { os << "tf_tensor"; return; } - if (type.isa()) { + if (mlir::isa(type)) { os << "tf_allocator"; return; } diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td index 0f74b0feca1821..0c42590f9aa7ee 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.td @@ -27,7 +27,6 @@ def Fallback_Dialect : Dialect { }]; let cppNamespace = "::tfrt::fallback"; - let usePropertiesForAttributes = 0; } // This corresponds to tensorflow::Tensor. diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.td b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.td index 8c8bcd0ab4ffac..5dd788ef328858 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.td +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.td @@ -33,7 +33,6 @@ def FallbackAsync_Dialect : Dialect { }]; let cppNamespace = "::tfrt::fallback_async"; - let usePropertiesForAttributes = 0; } class FallbackAsync_Op traits = []> : diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc index 3a835b3796962d..30f6aa234a2d59 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project namespace tfrt { namespace fallback_common { @@ -31,8 +32,8 @@ void GetExecuteOpAttrsCommon( mlir::Builder builder(context); for (auto iter : op_attr_array) { - auto key_value = iter.cast().getValue(); - llvm::StringRef key = key_value[0].cast().getValue(); + auto key_value = mlir::cast(iter).getValue(); + llvm::StringRef key = mlir::cast(key_value[0]).getValue(); mlir::Attribute value = key_value[1]; op_attrs->push_back({key, value}); } diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h index e78d247c038c64..0cddb1017a33d8 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h @@ -21,6 +21,7 @@ limitations under the License. #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tfrt/basic_kernels/opdefs/types.h" // from @tf_runtime namespace tfrt { @@ -30,9 +31,9 @@ template mlir::LogicalResult VerifyExecuteOpCommon(OpTy op) { auto op_attr_array = op.getOpAttrs().getValue(); for (auto op_attr : op_attr_array) { - auto key_value = op_attr.template dyn_cast(); + auto key_value = mlir::dyn_cast(op_attr); if (!key_value || key_value.getValue().size() != 2 || - !key_value.getValue()[0].template isa()) + !mlir::isa(key_value.getValue()[0])) return op.emitOpError() << "each op_attr should be a key-value pair, " "where the key is a string"; } @@ -47,10 +48,10 @@ mlir::LogicalResult VerifyFallbackExecuteOp(OpTy op) { // Verify function attributes. 
auto op_func_attr_array = op.getOpFuncAttrs().getValue(); for (auto op_attr : op_func_attr_array) { - auto key_value = op_attr.template dyn_cast(); + auto key_value = mlir::dyn_cast(op_attr); if (!key_value || key_value.getValue().size() != 2 || - !key_value.getValue()[0].template isa() || - !key_value.getValue()[1].template isa()) + !mlir::isa(key_value.getValue()[0]) || + !mlir::isa(key_value.getValue()[1])) return op.emitOpError() << "each op_func_attr should be a key-value " "pair, where both the key and the value are " "strings"; @@ -63,11 +64,11 @@ void PrintExecuteOpFuncAttribute(mlir::OpAsmPrinter &p, OpTy op) { auto op_func_attrs = op.getOpFuncAttrs(); if (!op_func_attrs.empty()) { auto print_key_value = [&](mlir::Attribute attr) { - auto key_value = attr.cast().getValue(); + auto key_value = mlir::cast(attr).getValue(); auto key = key_value[0]; auto value = key_value[1]; - p << key.cast().getValue(); + p << mlir::cast(key).getValue(); p << " = "; p << value; }; @@ -84,11 +85,11 @@ void PrintExecuteOpCommon(mlir::OpAsmPrinter &p, OpTy op) { auto op_attrs = op.getOpAttrs(); if (!op_attrs.empty()) { auto print_key_value = [&](mlir::Attribute attr) { - auto key_value = attr.cast().getValue(); + auto key_value = mlir::cast(attr).getValue(); auto key = key_value[0]; auto value = key_value[1]; - p << key.cast().getValue(); + p << mlir::cast(key).getValue(); p << " = "; p << value; }; diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.cc b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.cc index 8083fcac076745..6a429ef275e869 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.cc +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.cc @@ -30,6 +30,7 @@ limitations under the License. #include "tfrt/core_runtime/opdefs/attributes.h" // from @tf_runtime #include "tfrt/core_runtime/opdefs/core_runtime.h" // from @tf_runtime #include "tfrt/core_runtime/opdefs/types.h" // from @tf_runtime +#include "tfrt/tensor/opdefs/tensor.h" // from @tf_runtime namespace tfrt { namespace fallback_sync { @@ -50,7 +51,7 @@ FallbackSyncDialect::FallbackSyncDialect(MLIRContext *context) } static Type GetTensorType(Builder *builder) { - return tfrt::t::TensorType::get(builder->getContext()); + return tfrt::tfrt_tensor::TensorType::get(builder->getContext()); } } // namespace fallback_sync diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc index 50b1a199e47c17..c63952f55de1b5 100644 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc +++ b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_executor.cc @@ -147,7 +147,6 @@ void RuntimeFallbackExecutor::Prepare(llvm::StringRef mlir_input) { pipeline_opts.sink_in_invariant_ops = false; pipeline_opts.cost_threshold = 1024; pipeline_opts.merge_inter_dependent_streams = true; - pipeline_opts.func_use_fallback_tensor = true; mlir::PassManager pm(module->getContext()); pm.addPass(CreateTfToTfrtConversionPass(pipeline_opts)); diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc index d02155c88b7e22..93d50a012a6fed 100644 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc @@ -27,6 +27,7 @@ limitations under the License. 
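For reference, each entry the verifiers above accept is a two-element `ArrayAttr` of the form `[key, value]` with a `StringAttr` key; a minimal construction sketch (the helper name is hypothetical):

```cpp
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/MLIRContext.h"

// Build one key-value entry of the form the verifiers above accept.
mlir::ArrayAttr MakeOpAttrEntry(mlir::MLIRContext* context,
                                llvm::StringRef key, mlir::Attribute value) {
  mlir::Builder builder(context);
  return builder.getArrayAttr({builder.getStringAttr(key), value});
}
```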
#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -42,9 +43,9 @@ namespace { using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr; llvm::StringRef ProcessIndexPath(mlir::ArrayAttr index_path) { - if (index_path.size() == 1 && index_path[0].isa()) { + if (index_path.size() == 1 && mlir::isa(index_path[0])) { // TODO(chky): Support cases where index_path is not a single string. - return index_path[0].cast().getValue(); + return mlir::cast(index_path[0]).getValue(); } return ""; } @@ -71,7 +72,7 @@ Status MapFunctionSignaturesFromTFSavedModelMLIR( llvm::function_ref map_fn) { // Create bound inputs for each functions. mlir::SymbolTable symbol_table(module); - tensorflow::Status status = OkStatus(); + tensorflow::Status status = absl::OkStatus(); module.walk([&symbol_table, map_fn, &status](mlir::func::FuncOp func) { // Use the exported name as the function name, and skip non-exported // functions. @@ -92,8 +93,8 @@ Status MapFunctionSignaturesFromTFSavedModelMLIR( if (auto input_index_path = func.getArgAttrOfType( i, kTfSavedModelIndexPathAttr)) { input_names.push_back(ProcessIndexPath(input_index_path)); - auto statusor_spec = - ProcessTensorSpec(func_type.getInput(i).cast()); + auto statusor_spec = ProcessTensorSpec( + mlir::cast(func_type.getInput(i))); if (!statusor_spec.ok()) { status = std::move(statusor_spec).status(); return mlir::WalkResult::interrupt(); @@ -120,8 +121,8 @@ Status MapFunctionSignaturesFromTFSavedModelMLIR( if (auto output_index_path = func.getResultAttrOfType( i, kTfSavedModelIndexPathAttr)) { output_names.push_back(ProcessIndexPath(output_index_path)); - auto statusor_spec = - ProcessTensorSpec(func_type.getResult(i).cast()); + auto statusor_spec = ProcessTensorSpec( + mlir::cast(func_type.getResult(i))); if (!statusor_spec.ok()) { status = std::move(statusor_spec).status(); return mlir::WalkResult::interrupt(); diff --git a/tensorflow/compiler/mlir/tfrt/tests/batch_function_lowering.mlir b/tensorflow/compiler/mlir/tfrt/tests/batch_function_lowering.mlir index 9f6e47567f8c65..6944b477f535e6 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/batch_function_lowering.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/batch_function_lowering.mlir @@ -1,4 +1,4 @@ -// RUN: tf-tfrt-opt -tf-executor-to-tfrt-pipeline="func-use-fallback-tensor=true" %s | FileCheck %s --dump-input=always +// RUN: tf-tfrt-opt -tf-executor-to-tfrt-pipeline %s | FileCheck %s --dump-input=always func.func private @batched_function(%arg0: tensor<1x3xf32> {tf._user_specified_name = "0"}, %arg1: tensor<*x!tf_type.resource>) -> tensor<1x3xf32> attributes {tf._input_shapes = [#tf_type.shape<1x3>, #tf_type.shape<*>], tf.signature.is_stateful} { %0 = "tf.ReadVariableOp"(%arg1) {device = "/device:CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<1x3xf32> diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir index dec4b733d25b19..39c10c07cdcf35 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir @@ -9,6 +9,7 
@@ // CHECK-NEXT: [[KEY:%.*]], [[FUTURE:%.*]] = "tf.IfrtLoadVariable"([[HANDLE2]]) // CHECK-SAME: device_sharding_config_proto_text = "sharding { type: OTHER tile_assignment_dimensions: 2 tile_assignment_dimensions: 1 tile_assignment_devices: 0 tile_assignment_devices: 1 } device_ids: 0 device_ids: 1 " // CHECK-SAME: name = "__y" +// CHECK-SAME: used_by_host = false // CHECK-NEXT: [[RES:%.*]] = "tf.IfrtCall"([[KEY]], %arg0) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [0 : i32]}> // CHECK-SAME: : (tensor, tensor<1x3xf32>) -> tensor<1x1xf32> // CHECK-NEXT: return [[RES]] : tensor<1x1xf32> @@ -29,6 +30,7 @@ module { // CHECK: "tf.VarHandleOp" // CHECK-NOT: [[VARIABLE:%.*]] = "tf.ReadVariableOp" // CHECK-NEXT: [[KEY:%.*]], [[FUTURE:%.*]] = "tf.IfrtLoadVariable" +// CHECK-SAME: used_by_host = true // CHECK-NEXT: "tf.MatMul"(%arg0, [[FUTURE]]) // CHECK-NEXT: [[RES:%.*]] = "tf.IfrtCall"(%arg0, [[KEY]]) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> // CHECK-NEXT: return [[RES]] : tensor<1x1xf32> @@ -50,6 +52,7 @@ module { // CHECK: "tf.VarHandleOp" // CHECK-NOT: [[VARIABLE:%.*]] = "tf.ReadVariableOp" // CHECK-NEXT: [[KEY:%.*]], [[FUTURE:%.*]] = "tf.IfrtLoadVariable" +// CHECK-SAME: used_by_host = true // CHECK-NEXT: [[RES:%.*]] = "tf.MatMul"(%arg0, [[FUTURE]]) // CHECK-NEXT: return [[RES]] : tensor<1x1xf32> // diff --git a/tensorflow/compiler/mlir/tfrt/tests/lower_bound_batch_threads.mlir b/tensorflow/compiler/mlir/tfrt/tests/lower_bound_batch_threads.mlir new file mode 100644 index 00000000000000..317d9b3ad9e00a --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/lower_bound_batch_threads.mlir @@ -0,0 +1,53 @@ +// RUN: tf-tfrt-opt -split-input-file -tfrt-lower-bound-batch-threads="tfrt-min-num-batch-threads=2" %s | FileCheck %s --dump-input=always + +// ----- + +// The num_batch_threads is lowered bound to 2 from the original attribute of 1 + +// CHECK-LABEL: func private @batched_function +func.func private @batched_function(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> { + %2 = "tf.Identity"(%arg0) : (tensor<1x3xf32>) -> tensor<1x3xf32> + func.return %2 : tensor<1x3xf32> +} + +// CHECK-LABEL: func @main +func.func @main(%arg0: tensor<1x3xf32>) -> tensor<*xf32> { + // CHECK: "tf.BatchFunction" + // CHECK-SAME: allowed_batch_sizes = [6] + // CHECK-SAME: batch_timeout_micros = 100000 : i64 + // CHECK-SAME: batching_queue = "" + // CHECK-SAME: container = "" + // CHECK-SAME: enable_large_batch_splitting = false + // CHECK-SAME: max_batch_size = 6 : i64 + // CHECK-SAME: max_enqueued_batches = 10 : i64 + // CHECK-SAME: num_batch_threads = 2 : i64 + // CHECK-SAME: shared_name = "batch/" + %1 = "tf.BatchFunction"(%arg0) {allowed_batch_sizes = [6], batch_timeout_micros = 100000 : i64, batching_queue = "", container = "", device = "/device:CPU:0", enable_large_batch_splitting = false, f = @batched_function, max_batch_size = 6 : i64, max_enqueued_batches = 10 : i64, num_batch_threads = 1 : i64, operandSegmentSizes = array, shared_name = "batch/"} : (tensor<1x3xf32>) -> tensor<*xf32> + func.return %1 : tensor<*xf32> +} + +// ----- + +// The num_batch_threads remains 3 (the same as the original attribute) + +// CHECK-LABEL: func private @batched_function +func.func private @batched_function(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> { + %2 = "tf.Identity"(%arg0) : (tensor<1x3xf32>) -> tensor<1x3xf32> + func.return %2 : tensor<1x3xf32> +} + +// CHECK-LABEL: func @main +func.func @main(%arg0: tensor<1x3xf32>) -> tensor<*xf32> { + // CHECK: 
"tf.BatchFunction" + // CHECK-SAME: allowed_batch_sizes = [6] + // CHECK-SAME: batch_timeout_micros = 100000 : i64 + // CHECK-SAME: batching_queue = "" + // CHECK-SAME: container = "" + // CHECK-SAME: enable_large_batch_splitting = false + // CHECK-SAME: max_batch_size = 6 : i64 + // CHECK-SAME: max_enqueued_batches = 10 : i64 + // CHECK-SAME: num_batch_threads = 3 : i64 + // CHECK-SAME: shared_name = "batch/" + %1 = "tf.BatchFunction"(%arg0) {allowed_batch_sizes = [6], batch_timeout_micros = 100000 : i64, batching_queue = "", container = "", device = "/device:CPU:0", enable_large_batch_splitting = false, f = @batched_function, max_batch_size = 6 : i64, max_enqueued_batches = 10 : i64, num_batch_threads = 3 : i64, operandSegmentSizes = array, shared_name = "batch/"} : (tensor<1x3xf32>) -> tensor<*xf32> + func.return %1 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir index e1ad0aea205007..24e015734a732f 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/rewrite_ifrt_load_variable.mlir @@ -5,7 +5,7 @@ // CHECK-LABEL: func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> // CHECK-NEXT: [[HANDLE:%.*]] = "tf.VarHandleOp"() // CHECK-NEXT: [[ARRAYKEY:%.*]], [[FURTURE:%.*]] = "tf_mlrt.tf_ifrt_load_variable"([[HANDLE]]) -// CHECK-SAME: {device_sharding_config_proto_text = "sharding { }", name = "__y"} : (tensor>>) -> (tensor, !mlrt.future) +// CHECK-SAME: <{device_sharding_config_proto_text = "sharding { }", name = "__y", used_by_host = true}> : (tensor>>) -> (tensor, !mlrt.future) // CHECK-NEXT: [[TENSOR:%.*]] = "tf_mlrt.tf_await"([[FURTURE]]) : (!mlrt.future) -> tensor<3x1xf32> // CHECK-NEXT: "tf.MatMul"(%arg0, [[TENSOR]]) : (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> // CHECK-NEXT: "tf.IfrtCall"(%arg0, [[ARRAYKEY]]) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> {__tpu_compile_metadata_text = "retvals { sharding { } }"} : (tensor<1x3xf32>, tensor) -> tensor<1x1xf32> @@ -13,7 +13,7 @@ // func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { %0 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor>> - %array_key, %tensor = "tf.IfrtLoadVariable"(%0) <{device_sharding_config_proto_text = "sharding { }", name = "__y"}> : (tensor>>) -> (tensor, tensor<3x1xf32>) + %array_key, %tensor = "tf.IfrtLoadVariable"(%0) <{device_sharding_config_proto_text = "sharding { }", name = "__y", used_by_host = true}> : (tensor>>) -> (tensor, tensor<3x1xf32>) %1 = "tf.MatMul"(%arg0, %tensor) : (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> %2 = "tf.IfrtCall"(%arg0, %array_key) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> {__tpu_compile_metadata_text = "retvals { sharding { } }"} : (tensor<1x3xf32>, tensor) -> tensor<1x1xf32> return %2 : tensor<1x1xf32> diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir index 3151daf80ec759..e83e208967b334 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir @@ -470,7 +470,8 @@ func.func @ifrt_load_variable_test() -> () { // CHECK-NEXT: "tf_mlrt.ifrt_load_variable"([[HANDLE]]) // CHECK-SAME: device_sharding_config_proto_text // CHECK-SAME: name = "__variable" - %1, %2 = 
"tf_mlrt.tf_ifrt_load_variable"(%0) {device_sharding_config_proto_text = "sharding { } device_ids: 0 device_ids: 1 ", name = "__variable", __op_key = 2: i32, device = "/device:CPU:0"} : (tensor>>) -> (tensor, !mlrt.future) + // CHECK-SAME: used_by_host = true + %1, %2 = "tf_mlrt.tf_ifrt_load_variable"(%0) {used_by_host = true, device_sharding_config_proto_text = "sharding { } device_ids: 0 device_ids: 1 ", name = "__variable", __op_key = 2: i32, device = "/device:CPU:0"} : (tensor>>) -> (tensor, !mlrt.future) // CHECK-NEXT: mlrt.await_all_control // CHECK-NEXT: return func.return @@ -490,7 +491,7 @@ func.func @ifrt_restore_variable_test() -> () { %cst_1 = "tf.Const"() {__op_key = 2: i32, value = dense<["y"]> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> // CHECK-NEXT: [[HANDLE:%.*]] = tf_mlrt.executeop %handle = "tf.VarHandleOp"() {__op_key = 3: i32, container = "x", shared_name = "y"} : () -> tensor>> - // CHECK-NEXT: "tf_mlrt.ifrt_restore_variable"([[PREFIX]], [[NAME]], [[SLICE]], [[HANDLE]]) {restored_dtypes = [f32]} + // CHECK-NEXT: "tf_mlrt.ifrt_restore_variable"([[PREFIX]], [[NAME]], [[SLICE]], [[HANDLE]]) <{restored_dtypes = [f32]}> "tf.IfrtRestoreVariableOp"(%cst, %cst_1, %cst_0, %handle) {restored_dtypes = [f32]} : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor>>) -> () // CHECK-NEXT: return func.return diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir index 2b1f5fc9b17a4e..a74d6509a0ed4c 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir @@ -638,3 +638,49 @@ func.func private @tf.NestedWhileRegion_cond(%arg0: tensor, %arg1: tensor } +// ----- + +// Test a while to map_fn conversion is skipped if the tensor list cannot be found in the current function body. 
+ +// CHECK-LABEL: map/while_cond +func.func private @"map/while_cond"(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> tensor { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<3> : tensor} : () -> tensor + %0 = "tf.Less"(%arg0, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Less"(%arg1, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.LogicalAnd"(%0, %1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: map/while_body +func.func private @"map/while_body"(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> (tensor, tensor, tensor>>, tensor) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00, 9.000000e+00]> : tensor<9xf32>} : () -> tensor<9xf32> + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<3> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_3 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00]> : tensor<9xf32>} : () -> tensor<9xf32> + %cst_4 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg0, %cst_4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Mul"(%arg3, %cst_3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<9xf32>) -> tensor<9xf32> + %2 = "tf.Reshape"(%1, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<9xf32>, tensor<2xi32>) -> tensor<3x3xf32> + %3 = "tf.AddV2"(%arg1, %cst_4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %4 = "tf.GatherV2"(%cst_1, %arg1, %cst_0) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<3xi32>, tensor, tensor) -> tensor + %5 = "tf.Cast"(%4) {Truncate = false, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor + %6 = "tf.Mul"(%5, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<9xf32>) -> tensor<9xf32> + %7 = "tf.Reshape"(%6, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<9xf32>, tensor<2xi32>) -> tensor<3x3xf32> + %8 = "tf.MatMul"(%2, %7) {device = "/job:localhost/replica:0/task:0/device:CPU:0", transpose_a = false, transpose_b = false} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + %9 = "tf.MatrixDeterminant"(%8) {T = f32, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<3x3xf32>) -> tensor + %10 = "tf.TensorListSetItem"(%arg2, %arg1, %9) {device = "/job:localhost/replica:0/task:0/device:CPU:0", resize_if_index_out_of_bounds = false} : (tensor>>, tensor, tensor) -> tensor>> + return %0, %3, %10, %arg3 : tensor, tensor, tensor>>, tensor +} + +//CHECK-LABEL: @func +func.func 
@func(%arg0: tensor, %arg1: tensor>>) -> tensor<3xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input:0", outputs = "PartitionedCall:0"}} { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<-1> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<3> : tensor} : () -> tensor + // CHECK-NOT: tf_map_fn + %1:4 = "tf.While"(%cst, %cst, %arg1, %arg0) {_lower_using_switch_merge = true, _num_original_outputs = 6 : i64, _read_only_resource_inputs = [], _xla_propagate_compile_time_consts = true, body = @"map/while_body", cond = @"map/while_cond", device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = true, parallel_iterations = 4 : i64, shape_invariant} : (tensor, tensor, tensor>>, tensor) -> (tensor, tensor, tensor>>, tensor) + %2 = "tf.TensorListStack"(%1#2, %cst_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = 3 : i64} : (tensor>>, tensor<0xi32>) -> tensor<3xf32> + return %2 : tensor<3xf32> +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir index 77e795b8bf47a1..f8951181427fd6 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir @@ -1,4 +1,4 @@ -// RUN: tf-tfrt-opt -tf-to-tfrt=func-use-fallback-tensor=true %s | FileCheck %s --dump-input=fail +// RUN: tf-tfrt-opt -tf-to-tfrt %s | FileCheck %s --dump-input=fail // _output_shapes and f.* attributes are removed during tf-to-tfrt lowering. 
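// For instance (hypothetical op, shown only to illustrate the line above; it
// is not part of this test), an input such as
//   %0 = "tf.SomeOp"(%arg) {_output_shapes = [#tf_type.shape<1x3>], f.some_attr = true} : (tensor<1x3xf32>) -> tensor<1x3xf32>
// would be expected to lower to a fallback invocation that carries neither
// _output_shapes nor any f.* attribute.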
// CHECK-LABEL: func @remove_unused_attr diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir index 1f8eb1ee6ee01e..57febbbc0ab14c 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir @@ -1,4 +1,4 @@ -// RUN: tf-tfrt-opt -pass-pipeline='builtin.module(func.func(tf-tensor-device-copy),tfrt-lower-tf-savedmodel{hoist-invariant-ops=true},tf-to-tfrt{func-use-fallback-tensor=true tfrt-cost-threshold=1024 tfrt-merge-inter-dependent-streams=true})' %s | FileCheck %s --dump-input-filter=all +// RUN: tf-tfrt-opt -pass-pipeline='builtin.module(func.func(tf-tensor-device-copy),tfrt-lower-tf-savedmodel{hoist-invariant-ops=true},tf-to-tfrt{tfrt-cost-threshold=1024 tfrt-merge-inter-dependent-streams=true})' %s | FileCheck %s --dump-input-filter=all // CHECK-NOT: tf_saved_model.semantics // CHECK: tfrt.cost_threshold = 1024 diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir index b208fe390acc3f..6596d650889384 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/const_tensor.mlir @@ -9,12 +9,12 @@ func.func @string_tensor() -> (tensor<0x!tf_type.string>, tensor<7x!tf_type.stri func.return %0, %1 : tensor<0x!tf_type.string>, tensor<7x!tf_type.string> } -// Convert tf.Const to corert.const_dense_tensor only on cpu device +// Convert tf.Const to tfrt_fallback_async.const_dense_tensor only on cpu device // CHECK-LABEL: func @dense_tensor func.func @dense_tensor() -> tensor<4xui64> { - // CHECK: corert.const_dense_tensor dense<[1, 2, 3, 4]> : tensor<4xui64> + // CHECK: tfrt_fallback_async.const_dense_tensor dense<[1, 2, 3, 4]> : tensor<4xui64> %0 = "tf.Const"() {value = dense<[1, 2, 3, 4]> : tensor<4xui64>} : () -> tensor<4xui64> - // CHECK: corert.const_dense_tensor dense<1.000000e+00> : tensor<1xbf16> + // CHECK: tfrt_fallback_async.const_dense_tensor dense<1.000000e+00> : tensor<1xbf16> %1 = "tf.Const"() {device = "/device:CPU:0", value = dense<[1.0]> : tensor<1xbf16>} : () -> tensor<4xbf16> // CHECK: corert.executeop({{.*}}) "tf.Const"() {dtype = ui64, value = dense<[1, 2, 3, 4]> : tensor<4xui64>} : 1 %2 = "tf.Const"() {device = "/device:GPU:0", value = dense<[1, 2, 3, 4]> : tensor<4xui64>} : () -> tensor<4xui64> diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/control_flow.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/control_flow.mlir index dac8c0a71c15fb..ad3232042ca5e7 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/control_flow.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/control_flow.mlir @@ -1,44 +1,44 @@ -// RUN: tf-tfrt-opt -tf-to-tfrt %s | FileCheck %s --dump-input=fail +// RUN: tf-tfrt-opt -tf-to-tfrt="enable-while-parallel-iterations=true" %s | FileCheck %s --dump-input=fail -// CHECK-LABEL: func @cond_false(%arg0: !tfrt.chain, %arg1: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-LABEL: func @cond_false(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @cond_false(%arg0: tensor) -> tensor { %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<-1> : tensor} : () -> tensor %1 = "tf.Add"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor func.return %1 : tensor } -// CHECK-LABEL: func @cond_true(%arg0: 
!tfrt.chain, %arg1: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-LABEL: func @cond_true(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @cond_true(%arg0: tensor) -> tensor { %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<1> : tensor} : () -> tensor %1 = "tf.Add"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor func.return %1 : tensor } -// CHECK-LABEL: func @cond(%arg0: !tfrt.chain, %arg1: !corert.tensorhandle, %arg2: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-LABEL: func @cond(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor, %arg2: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @cond(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: [[cond:%.*]] = tfrt_fallback_async.predicate // CHECK: [[cond_res:%.*]]:2 = tfrt.cond [[cond]] - // CHECK-SAME: @cond_true @cond_false(%arg0, %arg2) : (!tfrt.chain, !corert.tensorhandle) + // CHECK-SAME: @cond_true @cond_false(%arg0, %arg2) : (!tfrt.chain, !tfrt_fallback.tf_tensor) %2 = "tf.If"(%arg0, %arg1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = true} : (tensor, tensor) -> tensor // CHECK: [[out_ch:%.*]] = tfrt.merge.chains [[cond_res]]#0, %arg0 : !tfrt.chain, !tfrt.chain - // CHECK: tfrt.return [[out_ch]], [[cond_res]]#1 : !tfrt.chain, !corert.tensorhandle + // CHECK: tfrt.return [[out_ch]], [[cond_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor func.return %2 : tensor } -// CHECK-LABEL: func @cond_stateful(%arg0: !tfrt.chain, %arg1: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-LABEL: func @cond_stateful(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @cond_stateful(%arg0: tensor) -> tensor { %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor %1 = "tf.Less"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor // CHECK: [[cond_res:%.*]]:2 = tfrt.cond - // CHECK-SAME: @cond_true @cond_false(%arg0, %arg1) : (!tfrt.chain, !corert.tensorhandle) + // CHECK-SAME: @cond_true @cond_false(%arg0, %arg1) : (!tfrt.chain, !tfrt_fallback.tf_tensor) %2 = "tf.If"(%1, %arg0) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor, tensor) -> tensor // Note: returns %out_op_chain. 
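// (Contrast with the stateless @cond above, where the branch out-chain is
// merged with the incoming chain; for this stateful tf.If the branch
// out-chain is returned directly so its side effects stay ordered before
// later users. Roughly, as an illustration rather than extra FileCheck
// directives:
//   stateless: [[ch]] = tfrt.merge.chains [[cond_res]]#0, %arg0
//   stateful:  tfrt.return [[cond_res]]#0, [[cond_res]]#1)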
- // CHECK: tfrt.return [[cond_res]]#0, [[cond_res]]#1 : !tfrt.chain, !corert.tensorhandle + // CHECK: tfrt.return [[cond_res]]#0, [[cond_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor func.return %2 : tensor } // CHECK-LABEL: func @while_cond_lt9 -// CHECK-SAME: ({{%.+}}: !tfrt.chain, {{%.+}}: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-SAME: ({{%.+}}: !tfrt.chain, {{%.+}}: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @while_cond_lt9(%arg0: tensor) -> tensor { %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<9> : tensor} : () -> tensor %1 = "tf.Less"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor @@ -46,7 +46,7 @@ func.func @while_cond_lt9(%arg0: tensor) -> tensor { } // CHECK-LABEL: func @while_body_add2 -// CHECK-SAME: ({{%.+}}: !tfrt.chain, {{%.+}}: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-SAME: ({{%.+}}: !tfrt.chain, {{%.+}}: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @while_body_add2(%arg0: tensor) -> tensor { %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor %1 = "tf.Add"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor @@ -54,28 +54,26 @@ func.func @while_body_add2(%arg0: tensor) -> tensor { } // CHECK-LABEL: func @while_test -// CHECK-SAME: ([[ARG0:%.+]]: !tfrt.chain) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-SAME: ([[ARG0:%.+]]: !tfrt.chain) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @while_test() -> (tensor) { - // CHECK: [[CONST:%.+]] = corert.const_dense_tensor dense<0> : tensor + // CHECK: [[CONST:%.*]] = tfrt_fallback_async.const_dense_tensor dense<0> : tensor %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor - // CHECK: [[pred_res:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate"([[ARG0]], [[CONST]]) : (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, i1) + // CHECK: [[pred_res:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate"([[ARG0]], [[CONST]]) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, i1) // CHECK: [[while_res:%.]]:2 = tfrt.while [[pred_res]]#1 @"while_body_add2/tfrt_body_1"([[pred_res]]#0, [[CONST]]) - // CHECK-SAME: (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) + // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) %1 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) // CHECK: [[out_chain:%.*]] = tfrt.merge.chains [[while_res]]#0, [[ARG0]] - // CHECK: tfrt.return [[out_chain]], [[while_res]]#1 : !tfrt.chain, !corert.tensorhandle + // CHECK: tfrt.return [[out_chain]], [[while_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor func.return %1 : tensor } -// CHECK: func @"while_body_add2/tfrt_body_1"([[ch:%.*]]: !tfrt.chain, [[arg:%.*]]: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle, i1) -// CHECK: [[body_res:%.*]]:2 = tfrt.call @while_body_add2([[ch]], [[arg]]) : (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) -// CHECK: [[pred_res:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate"([[body_res]]#0, [[body_res]]#1) : (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, i1) -// CHECK: tfrt.return [[pred_res]]#0, [[body_res]]#1, [[pred_res]]#1 : !tfrt.chain, !corert.tensorhandle, i1 +// CHECK: func @"while_body_add2/tfrt_body_1"([[ch:%.*]]: !tfrt.chain, [[arg:%.*]]: 
!tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor, i1) +// CHECK: [[body_res:%.*]]:2 = tfrt.call @while_body_add2([[ch]], [[arg]]) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) +// CHECK: [[pred_res:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate"([[body_res]]#0, [[body_res]]#1) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, i1) +// CHECK: tfrt.return [[pred_res]]#0, [[body_res]]#1, [[pred_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor, i1 -// CHECK: func @"while_cond_lt9/tfrt_predicate"([[ch:%.*]]: !tfrt.chain, [[arg:%.*]]: !corert.tensorhandle) -> (!tfrt.chain, i1) -// CHECK: [[cond_res:%.*]]:2 = tfrt.call @while_cond_lt9([[ch]], [[arg]]) : (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) -// CHECK: [[cond:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[cond_res]]#1 -// CHECK-SAME: (!corert.tensorhandle) -> (!tfrt_fallback.tf_tensor) -// CHECK: [[bool_cond:%.*]] = tfrt_fallback_async.predicate [[cond]] +// CHECK: func @"while_cond_lt9/tfrt_predicate"([[ch:%.*]]: !tfrt.chain, [[arg:%.*]]: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, i1) +// CHECK: [[cond_res:%.*]]:2 = tfrt.call @while_cond_lt9([[ch]], [[arg]]) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) +// CHECK: [[bool_cond:%.*]] = tfrt_fallback_async.predicate [[cond_res]]#1 // CHECK: tfrt.return [[cond_res]]#0, [[bool_cond]] : !tfrt.chain, i1 // CHECK-LABEL: func @multi_while_test @@ -83,30 +81,102 @@ func.func @multi_while_test() -> (tensor, tensor) { %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor %1 = "tf.Const"() {device = "/device:CPU:0", value = dense<1> : tensor} : () -> tensor // CHECK: [[pred_0:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate" - // CHECK: tfrt.while [[pred_0]]#1 @"while_body_add2/tfrt_body_1" + // CHECK: tfrt.while [[pred_0]]#1 @"while_body_add2/tfrt_body_10" + // CHECK-SAME: parallel_iterations(10) // CHECK: [[pred_1:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate" // CHECK: tfrt.while [[pred_1]]#1 @"while_body_add2/tfrt_body_1" - %2 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) + // CHECK-SAME: parallel_iterations(1) + %2 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 10} : (tensor) -> (tensor) %3 = "tf.While"(%1) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) func.return %2, %3 : tensor, tensor } +func.func @side_effect_while_cond_lt9(%arg: tensor>>) -> tensor { + %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<9> : tensor} : () -> tensor + %1 = "tf.ReadVariableOp"(%arg) {device = "/device:CPU:0", dtype = i32} : (tensor>>) -> tensor + %2 = "tf.Less"(%1, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %2 : tensor +} + +func.func @side_effect_while_body_add2(%arg: tensor>>) -> (tensor>>) { + %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor + %1 = "tf.ReadVariableOp"(%arg) {device = "/device:CPU:0", dtype = i32} : (tensor>>) -> tensor + %2 = "tf.Add"(%1, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%arg, %2) {device = "/device:CPU:0"} : (tensor>>, tensor) -> () + func.return %arg : tensor>> +} + +// CHECK-LABEL: func @side_effect_while_test +func.func 
@side_effect_while_test() -> (tensor) { + %0 = "tf.VarHandleOp"() {device = "/device:CPU:0", container = "c", shared_name = "v"} : () -> tensor>> + // CHECK: [[while_res:%.]]:2 = tfrt.while {{%.*}} @"side_effect_while_body_add2/tfrt_body_1" + // CHECK: [[out_ch:%.*]], [[res:%.*]] = tfrt_fallback_async.executeop.seq([[while_res]]#0) {{.*}} "tf.ReadVariableOp" + %1 = "tf.While"(%0) { cond = @side_effect_while_cond_lt9, body = @side_effect_while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor>>) -> (tensor>>) + %2 = "tf.ReadVariableOp"(%1) {device = "/device:CPU:0", dtype = i32} : (tensor>>) -> tensor + func.return %2 : tensor +} + +func.func @tensor_array_while_cond(%index: tensor, %size: tensor, %flow_0: tensor, %flow_1: tensor, %handle_0: tensor<2x!tf_type.resource>>, %handle_1: tensor<2x!tf_type.resource>>) -> (tensor) { + %0 = "tf.Less"(%index, %size) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %0 : tensor +} + +func.func @tensor_array_while_body(%index: tensor, %size: tensor, %flow_0: tensor, %flow_1: tensor, %handle_0: tensor<2x!tf_type.resource>>, %handle_1: tensor<2x!tf_type.resource>>) -> (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) { + %cst = "tf.Const"() {value = dense<1.1> : tensor<100x512xf32>} : () -> tensor<100x512xf32> + %one = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %x = "tf.TensorArrayReadV3"(%handle_0, %index, %flow_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor + %y = "tf.MatMul"(%x, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<100x512xf32>) -> (tensor) + %flow_1_out = "tf.TensorArrayWriteV3"(%handle_1, %index, %y, %flow_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor + %next_index = "tf.AddV2"(%index, %one) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %next_index, %size, %flow_0, %flow_1_out, %handle_0, %handle_1 : tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>> +} + +// CHECK-LABEL: func @tensor_array_while_test +// CHECK-SAME: ([[in_chain:%.*]]: !tfrt.chain +func.func @tensor_array_while_test(%indices: tensor, %input_0: tensor, %input_1: tensor) -> (tensor, tensor) { + %index = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> (tensor) + %size = "tf.Const"() {device = "/device:CPU:0", value = dense<9> : tensor} : () -> (tensor) + %handle_0, %flow_0 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/input_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + %handle_1, %flow_1 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/output_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + %flow_01 = "tf.TensorArrayScatterV3"(%handle_0, %indices, %input_0, %flow_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) 
-> tensor + // CHECK: [[pred_0:%.*]]:2 = tfrt.call @"tensor_array_while_cond/tfrt_predicate"([[in_chain]] + // CHECK: [[while_res_0:%.*]]:7 = tfrt.while {{%.*}} @"tensor_array_while_body/tfrt_body_10"([[pred_0]]#0 + // CHECK-SAME: parallel_iterations(10) + %res_0:6 = "tf.While"(%index, %size, %flow_01, %flow_1, %handle_0, %handle_1) {body = @tensor_array_while_body, cond = @tensor_array_while_cond, device = "", is_stateless = false, parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) -> (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) + %output_0 = "tf.TensorArrayGatherV3"(%handle_1, %indices, %res_0#3) {device = "/job:localhost/replica:0/task:0/device:CPU:0", element_shape = #tf_type.shape} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor + + %handle_2, %flow_2 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/input_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + %handle_3, %flow_3 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/output_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + %flow_21 = "tf.TensorArrayScatterV3"(%handle_2, %indices, %input_1, %flow_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor + // CHECK: [[pred_1:%.*]]:2 = tfrt.call @"tensor_array_while_cond/tfrt_predicate"([[in_chain]] + // CHECK: [[while_res_1:%.*]]:7 = tfrt.while {{%.*}} @"tensor_array_while_body/tfrt_body_10"([[pred_1]]#0 + // CHECK-SAME: parallel_iterations(10) + %res_1:6 = "tf.While"(%index, %size, %flow_21, %flow_3, %handle_2, %handle_3) {body = @tensor_array_while_body, cond = @tensor_array_while_cond, device = "", is_stateless = false, parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) -> (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) + %output_1 = "tf.TensorArrayGatherV3"(%handle_3, %indices, %res_1#3) {device = "/job:localhost/replica:0/task:0/device:CPU:0", element_shape = #tf_type.shape} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor + func.return %output_0, %output_1 : tensor, tensor +} + +// CHECK: func @"tensor_array_while_body/tfrt_body_10" + func.func @callee(%arg0: tensor) -> (tensor) { func.return %arg0: tensor } // CHECK-LABEL: func @call_test // CHECK-SAME: ([[chain:%.*]]: !tfrt.chain, -func.func @call_test(%arg0: tensor) -> (tensor, tensor) { +func.func @call_test(%arg0: tensor) -> (tensor, tensor, tensor) { %0 = "tf.Add"(%arg0, %arg0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor // CHECK: [[results_0:%.*]]:2 = tfrt.call @callee([[chain]] - // CHECK-SAME: (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) + // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) %1 = "tf.StatefulPartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor) -> 
(tensor) - // CHECK: [[results_1:%.*]]:2 = tfrt.call @callee([[chain]] - // CHECK-SAME: (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) + // CHECK-NEXT: [[results_1:%.*]]:2 = tfrt.call @callee([[chain]] + // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) %2 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor) -> (tensor) - // CHECK: [[results_0]]#1, [[results_1]]#1 - func.return %1, %2 : tensor, tensor + // CHECK-NEXT: [[results_2:%.*]]:2 = tfrt.call @callee([[chain]] + // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) + %3 = "tf.LegacyCall"(%0) {f = @callee} : (tensor) -> (tensor) + // CHECK: [[results_0]]#1, [[results_1]]#1, [[results_2]]#1 + func.return %1, %2, %3 : tensor, tensor, tensor } func.func @branch0(%arg0: tensor, %arg1: tensor) -> tensor { @@ -120,16 +190,12 @@ func.func @branch1(%arg0: tensor, %arg1: tensor) -> tensor { func.return %1 : tensor } -// CHECK-LABEL: func @case_test( -// CHECK-SAME: arg0: !tfrt.chain, -// CHECK-SAME: arg1: !corert.tensorhandle, -// CHECK-SAME: arg2: !corert.tensorhandle, -// CHECK-SAME: arg3: !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) { +// CHECK-LABEL: func @case_test +// CHECK-SAME: ([[chain:%.*]]: !tfrt.chain, [[tf_idx:%.*]]: !tfrt_fallback.tf_tensor, [[branch_arg0:%.*]]: !tfrt_fallback.tf_tensor, [[branch_arg1:%.*]]: !tfrt_fallback.tf_tensor) func.func @case_test(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { - // CHECK: %[[res_idx:[^ ]+]] = corert.tensorhandle_to_int32 %arg1 - // CHECK: %[[case_out:[^ ]+]]:2 = tfrt.case %[[res_idx]] [@branch0, @branch1](%arg0, %arg2, %arg3) : (!tfrt.chain, !corert.tensorhandle, !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) - // CHECK: %[[out_chain:[^ ]+]] = tfrt.merge.chains %[[case_out]]#0, %arg0 : !tfrt.chain, !tfrt.chain + // CHECK: [[th_idx:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[tf_idx]] + // CHECK-NEXT: [[idx:%.*]] = corert.tensorhandle_to_int32 [[th_idx]] + // CHECK-NEXT: [[out:%.*]] = tfrt.case [[idx]] [@branch0, @branch1]([[chain]], [[branch_arg0]], [[branch_arg1]]) %0 = "tf.Case"(%arg0, %arg1, %arg2) {_lower_using_switch_merge = true, branches = [@branch0, @branch1], is_stateless = true} : (tensor, tensor, tensor) -> tensor - // CHECK: tfrt.return %[[out_chain]], %[[case_out]]#1 : !tfrt.chain, !corert.tensorhandle func.return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir index a872b96a2fd6b4..ff0f0e7dbfd2cd 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir @@ -4,13 +4,11 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK-LABEL: func @gather // CHECK-SAME: ([[in_chain:%.*]]: !tfrt.chain -// CHECK-SAME: [[arg0:%.*]]: !corert.tensorhandle, [[arg1:%.*]]: !corert.tensorhandle) -// CHECK: [[const_th:%.*]] = corert.const_dense_tensor -// CHECK-NEXT: [[const:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[const_th]] {device = "/job:localhost/replica:0/task:0/device:CPU:0"} +// CHECK-SAME: [[arg0:%.*]]: !tfrt_fallback.tf_tensor, [[arg1:%.*]]: !tfrt_fallback.tf_tensor) +// CHECK: [[const:%.*]] = tfrt_fallback_async.const_dense_tensor // CHECK-NEXT: 
[[out_chain:%.*]], [[value:%.*]] = tfrt_fallback_async.executeop.seq([[in_chain]]) key(0) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.ReadVariableOp"({{.*}}) // CHECK-NEXT: [[res:%.*]] = tfrt_fallback_async.executeop key(1) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.GatherV2"([[value]], {{.*}}, [[const]]) -// CHECK-NEXT: [[res_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[res]] {device = "/job:localhost/replica:0/task:0/device:CPU:0"} -// CHECK-NEXT: tfrt.return [[out_chain]], [[res_th]] : !tfrt.chain, !corert.tensorhandle +// CHECK-NEXT: tfrt.return [[out_chain]], [[res]] : !tfrt.chain, !tfrt_fallback.tf_tensor func.func @gather(%indices: tensor, %resource: tensor<*x!tf_type.resource>) -> tensor<*xi32> { %0 = "tf.ResourceGather"(%resource, %indices) {batch_dims = 0 : i64, device = "/device:CPU:0", validate_indices = true}: (tensor<*x!tf_type.resource>, tensor) -> (tensor<*xi32>) diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir index 4c5777c28e2c98..e4dba7e395dbdf 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir @@ -1,4 +1,4 @@ -// RUN: tf-tfrt-opt -tf-to-tfrt=func-use-fallback-tensor=true %s | FileCheck %s --dump-input=fail +// RUN: tf-tfrt-opt -tf-to-tfrt %s | FileCheck %s --dump-input=fail // CHECK-LABEL: func @device_test func.func @device_test( diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir index b1c8c45d8b7a3f..1b03794356f1f6 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fallback.mlir @@ -15,7 +15,7 @@ // CHECK-LABEL: func @main // CHECK-SAME: {{.*}} !tfrt.chain -// CHECK-SAME: [[serialized:%.*]]: !corert.tensorhandle +// CHECK-SAME: [[serialized:%.*]]: !tfrt_fallback.tf_tensor func.func @main(%serialized: tensor<32x!tf_type.string>) -> (tensor) attributes {tf.entry_function = {inputs = "input0", outputs = "ParseExample/ParseExampleV2"}} { %dense_default_0 = "tf.Const"() {device = "/device:CPU:0", dtype = f32, value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> %dense_default_1 = "tf.Const"() {device = "/device:CPU:0", dtype = f32, value = dense<[]> : tensor<0xf32>} : () -> tensor<0xf32> @@ -24,10 +24,8 @@ func.func @main(%serialized: tensor<32x!tf_type.string>) -> (tensor) at %ragged_keys = "tf.Const"() {device = "/device:CPU:0", dtype = !tf_type.string, value = dense<""> : tensor<0x!tf_type.string>} : () -> tensor<0x!tf_type.string> %sparse_keys = "tf.Const"() {device = "/device:CPU:0", dtype = !tf_type.string, value = dense<""> : tensor<2x!tf_type.string>} : () -> tensor<2x!tf_type.string> - // CHECK: [[fallback_serialized:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[serialized]] - // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:CPU:0" // CHECK: [[outputs:%.*]]:8 = tfrt_fallback_async.executeop key(0) cost({{.*}}) device("/device:CPU:0") "tf.ParseExampleV2" - // CHECK-SAME: ([[fallback_serialized]] + // CHECK-SAME: ([[serialized]] // CHECK-NOT: device // CHECK-SAME: Tdense = [f32, f32] // CHECK-SAME: dense_shapes = [#corert.shape<>, #corert.shape<>] @@ -44,9 +42,7 @@ func.func @main(%serialized: tensor<32x!tf_type.string>) -> (tensor) at } : 
(tensor<32x!tf_type.string>, tensor<0x!tf_type.string>, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>, tensor<0x!tf_type.string>, tensor<0xf32>, tensor<0xf32>) -> (tensor, tensor, tensor, tensor, tensor<2xi64>, tensor<2xi64>, tensor<32xf32>, tensor<32xf32>) - // CHECK: [[result:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[outputs]]#0 - // CHECK-SAME: device = "/device:CPU:0" - // CHECK: tfrt.return {{.*}}, [[result]] + // CHECK: tfrt.return {{.*}}, [[outputs]]#0 func.return %outputs#0 : tensor } diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/func_use_fallback_tensor.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/func_use_fallback_tensor.mlir deleted file mode 100644 index 017efc64c31a50..00000000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/func_use_fallback_tensor.mlir +++ /dev/null @@ -1,207 +0,0 @@ -// RUN: tf-tfrt-opt -tf-to-tfrt="func-use-fallback-tensor=true enable-while-parallel-iterations=true" %s | FileCheck %s --dump-input=fail - -// This file tests the correctness of `func-use-fallback-tensor` option when -// converting from TF to TFRT. Since func op is used by the control flow ops, -// the test cases here should cover the control flow ops. - -// CHECK-LABEL: func @cond_false(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -func.func @cond_false(%arg0: tensor) -> tensor { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<-1> : tensor} : () -> tensor - %1 = "tf.Add"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %1 : tensor -} - -// CHECK-LABEL: func @cond_true(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -func.func @cond_true(%arg0: tensor) -> tensor { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<1> : tensor} : () -> tensor - %1 = "tf.Add"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %1 : tensor -} - -// CHECK-LABEL: func @cond(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor, %arg2: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -func.func @cond(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: [[cond:%.*]] = tfrt_fallback_async.predicate - // CHECK: [[cond_res:%.*]]:2 = tfrt.cond [[cond]] - // CHECK-SAME: @cond_true @cond_false(%arg0, %arg2) : (!tfrt.chain, !tfrt_fallback.tf_tensor) - %2 = "tf.If"(%arg0, %arg1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = true} : (tensor, tensor) -> tensor - // CHECK: [[out_ch:%.*]] = tfrt.merge.chains [[cond_res]]#0, %arg0 : !tfrt.chain, !tfrt.chain - // CHECK: tfrt.return [[out_ch]], [[cond_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor - func.return %2 : tensor -} - -// CHECK-LABEL: func @cond_stateful(%arg0: !tfrt.chain, %arg1: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -func.func @cond_stateful(%arg0: tensor) -> tensor { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor - %1 = "tf.Less"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - // CHECK: [[cond_res:%.*]]:2 = tfrt.cond - // CHECK-SAME: @cond_true @cond_false(%arg0, %arg1) : (!tfrt.chain, !tfrt_fallback.tf_tensor) - %2 = "tf.If"(%1, %arg0) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor, tensor) -> tensor - // Note: returns %out_op_chain. 
- // CHECK: tfrt.return [[cond_res]]#0, [[cond_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor - func.return %2 : tensor -} - -// CHECK-LABEL: func @while_cond_lt9 -// CHECK-SAME: ({{%.+}}: !tfrt.chain, {{%.+}}: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -func.func @while_cond_lt9(%arg0: tensor) -> tensor { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<9> : tensor} : () -> tensor - %1 = "tf.Less"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %1 : tensor -} - -// CHECK-LABEL: func @while_body_add2 -// CHECK-SAME: ({{%.+}}: !tfrt.chain, {{%.+}}: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -func.func @while_body_add2(%arg0: tensor) -> tensor { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor - %1 = "tf.Add"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %1 : tensor -} - -// CHECK-LABEL: func @while_test -// CHECK-SAME: ([[ARG0:%.+]]: !tfrt.chain) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -func.func @while_test() -> (tensor) { - // CHECK: [[CONST_TH:%.*]] = corert.const_dense_tensor dense<0> : tensor - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor - // CHECK: [[CONST:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[CONST_TH]] - // CHECK: (!corert.tensorhandle) -> (!tfrt_fallback.tf_tensor) - // CHECK: [[pred_res:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate"([[ARG0]], [[CONST]]) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, i1) - // CHECK: [[while_res:%.]]:2 = tfrt.while [[pred_res]]#1 @"while_body_add2/tfrt_body_1"([[pred_res]]#0, [[CONST]]) - // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) - %1 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) - // CHECK: [[out_chain:%.*]] = tfrt.merge.chains [[while_res]]#0, [[ARG0]] - // CHECK: tfrt.return [[out_chain]], [[while_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor - func.return %1 : tensor -} -// CHECK: func @"while_body_add2/tfrt_body_1"([[ch:%.*]]: !tfrt.chain, [[arg:%.*]]: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor, i1) -// CHECK: [[body_res:%.*]]:2 = tfrt.call @while_body_add2([[ch]], [[arg]]) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -// CHECK: [[pred_res:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate"([[body_res]]#0, [[body_res]]#1) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, i1) -// CHECK: tfrt.return [[pred_res]]#0, [[body_res]]#1, [[pred_res]]#1 : !tfrt.chain, !tfrt_fallback.tf_tensor, i1 - -// CHECK: func @"while_cond_lt9/tfrt_predicate"([[ch:%.*]]: !tfrt.chain, [[arg:%.*]]: !tfrt_fallback.tf_tensor) -> (!tfrt.chain, i1) -// CHECK: [[cond_res:%.*]]:2 = tfrt.call @while_cond_lt9([[ch]], [[arg]]) : (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) -// CHECK: [[bool_cond:%.*]] = tfrt_fallback_async.predicate [[cond_res]]#1 -// CHECK: tfrt.return [[cond_res]]#0, [[bool_cond]] : !tfrt.chain, i1 - -// CHECK-LABEL: func @multi_while_test -func.func @multi_while_test() -> (tensor, tensor) { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor - %1 = "tf.Const"() {device = "/device:CPU:0", value = dense<1> : tensor} : () -> tensor - // CHECK: [[pred_0:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate" 
- // CHECK: tfrt.while [[pred_0]]#1 @"while_body_add2/tfrt_body_10" - // CHECK-SAME: parallel_iterations(10) - // CHECK: [[pred_1:%.*]]:2 = tfrt.call @"while_cond_lt9/tfrt_predicate" - // CHECK: tfrt.while [[pred_1]]#1 @"while_body_add2/tfrt_body_1" - // CHECK-SAME: parallel_iterations(1) - %2 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 10} : (tensor) -> (tensor) - %3 = "tf.While"(%1) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) - func.return %2, %3 : tensor, tensor -} - -func.func @side_effect_while_cond_lt9(%arg: tensor>>) -> tensor { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<9> : tensor} : () -> tensor - %1 = "tf.ReadVariableOp"(%arg) {device = "/device:CPU:0", dtype = i32} : (tensor>>) -> tensor - %2 = "tf.Less"(%1, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %2 : tensor -} - -func.func @side_effect_while_body_add2(%arg: tensor>>) -> (tensor>>) { - %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor - %1 = "tf.ReadVariableOp"(%arg) {device = "/device:CPU:0", dtype = i32} : (tensor>>) -> tensor - %2 = "tf.Add"(%1, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - "tf.AssignVariableOp"(%arg, %2) {device = "/device:CPU:0"} : (tensor>>, tensor) -> () - func.return %arg : tensor>> -} - -// CHECK-LABEL: func @side_effect_while_test -func.func @side_effect_while_test() -> (tensor) { - %0 = "tf.VarHandleOp"() {device = "/device:CPU:0", container = "c", shared_name = "v"} : () -> tensor>> - // CHECK: [[while_res:%.]]:2 = tfrt.while {{%.*}} @"side_effect_while_body_add2/tfrt_body_1" - // CHECK: [[out_ch:%.*]], [[res:%.*]] = tfrt_fallback_async.executeop.seq([[while_res]]#0) {{.*}} "tf.ReadVariableOp" - %1 = "tf.While"(%0) { cond = @side_effect_while_cond_lt9, body = @side_effect_while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor>>) -> (tensor>>) - %2 = "tf.ReadVariableOp"(%1) {device = "/device:CPU:0", dtype = i32} : (tensor>>) -> tensor - func.return %2 : tensor -} - -func.func @tensor_array_while_cond(%index: tensor, %size: tensor, %flow_0: tensor, %flow_1: tensor, %handle_0: tensor<2x!tf_type.resource>>, %handle_1: tensor<2x!tf_type.resource>>) -> (tensor) { - %0 = "tf.Less"(%index, %size) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %0 : tensor -} - -func.func @tensor_array_while_body(%index: tensor, %size: tensor, %flow_0: tensor, %flow_1: tensor, %handle_0: tensor<2x!tf_type.resource>>, %handle_1: tensor<2x!tf_type.resource>>) -> (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) { - %cst = "tf.Const"() {value = dense<1.1> : tensor<100x512xf32>} : () -> tensor<100x512xf32> - %one = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %x = "tf.TensorArrayReadV3"(%handle_0, %index, %flow_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor - %y = "tf.MatMul"(%x, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<100x512xf32>) -> (tensor) - %flow_1_out = "tf.TensorArrayWriteV3"(%handle_1, %index, %y, %flow_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor - %next_index = "tf.AddV2"(%index, %one) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor - 
func.return %next_index, %size, %flow_0, %flow_1_out, %handle_0, %handle_1 : tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>> -} - -// CHECK-LABEL: func @tensor_array_while_test -// CHECK-SAME: ([[in_chain:%.*]]: !tfrt.chain -func.func @tensor_array_while_test(%indices: tensor, %input_0: tensor, %input_1: tensor) -> (tensor, tensor) { - %index = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> (tensor) - %size = "tf.Const"() {device = "/device:CPU:0", value = dense<9> : tensor} : () -> (tensor) - %handle_0, %flow_0 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/input_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) - %handle_1, %flow_1 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/output_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) - %flow_01 = "tf.TensorArrayScatterV3"(%handle_0, %indices, %input_0, %flow_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor - // CHECK: [[pred_0:%.*]]:2 = tfrt.call @"tensor_array_while_cond/tfrt_predicate"([[in_chain]] - // CHECK: [[while_res_0:%.*]]:7 = tfrt.while {{%.*}} @"tensor_array_while_body/tfrt_body_10"([[pred_0]]#0 - // CHECK-SAME: parallel_iterations(10) - %res_0:6 = "tf.While"(%index, %size, %flow_01, %flow_1, %handle_0, %handle_1) {body = @tensor_array_while_body, cond = @tensor_array_while_cond, device = "", is_stateless = false, parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) -> (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) - %output_0 = "tf.TensorArrayGatherV3"(%handle_1, %indices, %res_0#3) {device = "/job:localhost/replica:0/task:0/device:CPU:0", element_shape = #tf_type.shape} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor - - %handle_2, %flow_2 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/input_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) - %handle_3, %flow_3 = "tf.TensorArrayV3"(%size) {clear_after_read = true, device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = f32, dynamic_size = false, element_shape = #tf_type.shape, identical_element_shapes = true, tensor_array_name = "processed_embeddings/bidirectional_rnn/bw/bw/dynamic_rnn/output_0"} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) - %flow_21 = "tf.TensorArrayScatterV3"(%handle_2, %indices, %input_1, %flow_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor - // CHECK: [[pred_1:%.*]]:2 = tfrt.call @"tensor_array_while_cond/tfrt_predicate"([[in_chain]] - // CHECK: [[while_res_1:%.*]]:7 = tfrt.while {{%.*}} @"tensor_array_while_body/tfrt_body_10"([[pred_1]]#0 - // 
CHECK-SAME: parallel_iterations(10) - %res_1:6 = "tf.While"(%index, %size, %flow_21, %flow_3, %handle_2, %handle_3) {body = @tensor_array_while_body, cond = @tensor_array_while_cond, device = "", is_stateless = false, parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) -> (tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>) - %output_1 = "tf.TensorArrayGatherV3"(%handle_3, %indices, %res_1#3) {device = "/job:localhost/replica:0/task:0/device:CPU:0", element_shape = #tf_type.shape} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor - func.return %output_0, %output_1 : tensor, tensor -} - -// CHECK: func @"tensor_array_while_body/tfrt_body_10" - -func.func @callee(%arg0: tensor) -> (tensor) { - func.return %arg0: tensor -} - -// CHECK-LABEL: func @call_test -// CHECK-SAME: ([[chain:%.*]]: !tfrt.chain, -func.func @call_test(%arg0: tensor) -> (tensor, tensor, tensor) { - %0 = "tf.Add"(%arg0, %arg0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - // CHECK: [[results_0:%.*]]:2 = tfrt.call @callee([[chain]] - // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) - %1 = "tf.StatefulPartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor) -> (tensor) - // CHECK-NEXT: [[results_1:%.*]]:2 = tfrt.call @callee([[chain]] - // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) - %2 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor) -> (tensor) - // CHECK-NEXT: [[results_2:%.*]]:2 = tfrt.call @callee([[chain]] - // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) - %3 = "tf.LegacyCall"(%0) {f = @callee} : (tensor) -> (tensor) - // CHECK: [[results_0]]#1, [[results_1]]#1, [[results_2]]#1 - func.return %1, %2, %3 : tensor, tensor, tensor -} - -func.func @branch0(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf.Add" (%arg0, %arg1) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %0 : tensor -} - -func.func @branch1(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf.Add" (%arg0, %arg1) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - %1 = "tf.Add" (%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor - func.return %1 : tensor -} - -// CHECK-LABEL: func @case_test -// CHECK-SAME: ([[chain:%.*]]: !tfrt.chain, [[tf_idx:%.*]]: !tfrt_fallback.tf_tensor, [[branch_arg0:%.*]]: !tfrt_fallback.tf_tensor, [[branch_arg1:%.*]]: !tfrt_fallback.tf_tensor) -func.func @case_test(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { - // CHECK: [[th_idx:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[tf_idx]] - // CHECK-NEXT: [[idx:%.*]] = corert.tensorhandle_to_int32 [[th_idx]] - // CHECK-NEXT: [[out:%.*]] = tfrt.case [[idx]] [@branch0, @branch1]([[chain]], [[branch_arg0]], [[branch_arg1]]) - %0 = "tf.Case"(%arg0, %arg1, %arg2) {_lower_using_switch_merge = true, branches = [@branch0, @branch1], is_stateless = true} : (tensor, tensor, tensor) -> tensor - func.return %0 : tensor -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline.mlir index dd57c72674a3e8..763e188fd4e5c7 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline.mlir +++ 
b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline.mlir @@ -5,15 +5,10 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: func @__forward_call_369 // CHECK-SAME: ([[in_chain:%.*]]: !tfrt.chain -// CHECK-SAME: [[arg1_th:%.*]]: !corert.tensorhandle {tf._user_specified_name = "inputs"}, -// CHECK-SAME: [[arg2_th:%.*]]: !corert.tensorhandle, [[arg3_th:%.*]]: !corert.tensorhandle, [[arg4_th:%.*]]: !corert.tensorhandle, [[arg5_th:%.*]]: !corert.tensorhandle) +// CHECK-SAME: [[arg1:%.*]]: !tfrt_fallback.tf_tensor {tf._user_specified_name = "inputs"}, +// CHECK-SAME: [[arg2:%.*]]: !tfrt_fallback.tf_tensor, [[arg3:%.*]]: !tfrt_fallback.tf_tensor, [[arg4:%.*]]: !tfrt_fallback.tf_tensor, [[arg5:%.*]]: !tfrt_fallback.tf_tensor) // CHECK-SAME: -> (!tfrt.chain // CHECK: [[o1:%.*]] = tfrt_fallback_async.const_dense_tensor -// CHECK-NEXT: [[arg1:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[arg1_th]] {device = "/job:localhost/replica:0/task:0/device:CPU:0" -// CHECK-NEXT: [[arg4:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[arg4_th]] {device = "/job:localhost/replica:0/task:0/device:CPU:0" -// CHECK-NEXT: [[arg5:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[arg5_th]] {device = "/job:localhost/replica:0/task:0/device:CPU:0" -// CHECK-NEXT: [[arg2:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[arg2_th]] {device = "/job:localhost/replica:0/task:0/device:CPU:0" -// CHECK-NEXT: [[arg3:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[arg3_th]] {device = "/job:localhost/replica:0/task:0/device:CPU:0" // CHECK: [[o2_chain:%.*]], [[o2:%.*]] = tfrt_fallback_async.executeop.seq([[in_chain]]) key({{[0-9]+}}) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.ReadVariableOp"([[arg3]]) // CHECK-NEXT: [[o3_chain:%.*]], [[o3:%.*]] = tfrt_fallback_async.executeop.seq([[in_chain]]) key({{[0-9]+}}) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.ReadVariableOp"([[arg2]]) // CHECK-NEXT: [[o4_chain:%.*]], [[o4:%.*]] = tfrt_fallback_async.executeop.seq([[in_chain]]) key({{[0-9]+}}) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.ReadVariableOp"([[arg5]]) @@ -23,12 +18,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-NEXT: [[o8:%.*]] = tfrt_fallback_async.executeop key({{[0-9]+}}) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.Reshape"([[o7]], [[o1]]) // CHECK-NEXT: [[o9:%.*]] = tfrt_fallback_async.executeop key({{[0-9]+}}) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf._FusedMatMul"([[o8]], [[o5]], [[o4]]) // CHECK-NEXT: [[out_chain:%.*]] = tfrt.merge.chains [[o2_chain]], [[o3_chain]], [[o4_chain]], [[o5_chain]] -// CHECK-NEXT: [[o9_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[o9]] -// CHECK-NEXT: [[o5_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[o5]] -// CHECK-NEXT: [[o8_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[o8]] -// CHECK-NEXT: [[o6_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[o6]] -// CHECK-NEXT: [[o3_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[o3]] -// CHECK-NEXT: tfrt.return [[out_chain]], [[o9_th]], [[o5_th]], [[o8_th]], [[o6_th]], [[arg1_th]], [[o3_th]] : !tfrt.chain, !corert.tensorhandle, !corert.tensorhandle, !corert.tensorhandle, 
!corert.tensorhandle, !corert.tensorhandle, !corert.tensorhandle +// CHECK-NEXT: tfrt.return [[out_chain]], [[o9]], [[o5]], [[o8]], [[o6]], [[arg1]], [[o3]] : !tfrt.chain, !tfrt_fallback.tf_tensor, !tfrt_fallback.tf_tensor, !tfrt_fallback.tf_tensor, !tfrt_fallback.tf_tensor, !tfrt_fallback.tf_tensor, !tfrt_fallback.tf_tensor func.func @__forward_call_369(%arg0: tensor<16x224x224x3xf32> {tf._user_specified_name = "inputs"}, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*x!tf_type.resource>, %arg3: tensor<*x!tf_type.resource>, %arg4: tensor<*x!tf_type.resource>) -> (tensor, tensor<*xf32>, tensor, tensor<16x112x112x?xf32>, tensor<16x224x224x3xf32>, tensor<*xf32>) attributes {tf.entry_function = {control_outputs = "", inputs = "inputs_0,conv1_conv2d_readvariableop_resource,conv1_biasadd_readvariableop_resource,fc1000_matmul_readvariableop_resource,fc1000_biasadd_readvariableop_resource", outputs = "identity_RetVal,fc1000_matmul_readvariableop_RetVal,flatten_reshape_RetVal,relu_RetVal,inputs_RetVal,conv1_conv2d_readvariableop_RetVal"}} { %0:6 = tf_executor.graph { %outputs, %control = tf_executor.island wraps "tf.ReadVariableOp"(%arg2) {device = ""} : (tensor<*x!tf_type.resource>) -> tensor<*xf32> @@ -62,10 +52,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr } // CHECK-LABEL: func @while_test - // CHECK-SAME: ([[ARG0:%.+]]: !tfrt.chain) -> (!tfrt.chain, !corert.tensorhandle) + // CHECK-SAME: ([[ARG0:%.+]]: !tfrt.chain) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) func.func @while_test() -> (tensor) { // The predicate function should be inlined. - // CHECK: corert.const_dense_tensor dense<0> : tensor // CHECK-DAG: tfrt_fallback_async.const_dense_tensor dense<9> : tensor // CHECK-DAG: tfrt_fallback_async.const_dense_tensor dense<0> : tensor // CHECK-NEXT: tfrt_fallback_async.executeop key({{.*}}) cost({{.*}}) device("/device:CPU:0") "tf.Less" diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline_refvar.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline_refvar.mlir index e9518003023e96..481b5a421f0aa6 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline_refvar.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_to_corert_pipeline_refvar.mlir @@ -1,12 +1,11 @@ // RUN: tf-tfrt-opt -tf-executor-to-tfrt-pipeline %s | FileCheck %s --dump-input=fail // CHECK-LABEL: func @__inference_pruned_131 -// CHECK-SAME: ([[in_chain:%.*]]: !tfrt.chain) -> (!tfrt.chain, !corert.tensorhandle) +// CHECK-SAME: ([[in_chain:%.*]]: !tfrt.chain) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) // CHECK-NEXT: [[o_chain:%.*]], [[o:%.*]] = tfrt_fallback_async.executeop.seq([[in_chain]]) key(0) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.VarHandleOp"() // CHECK-NEXT: [[o_chain_0:%.*]], [[o1:%.*]] = tfrt_fallback_async.executeop.seq([[in_chain]]) key(1) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.ReadVariableOp"([[o]]) {dtype = f32} : 1 // CHECK-NEXT: [[out_ch:%.*]] = tfrt.merge.chains [[o_chain]], [[o_chain_0]] -// CHECK-NEXT: [[o2:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[o1]] -// CHECK-NEXT: tfrt.return [[out_ch]], [[o2]] : !tfrt.chain, !corert.tensorhandle +// CHECK-NEXT: tfrt.return [[out_ch]], [[o1]] : !tfrt.chain, !tfrt_fallback.tf_tensor module attributes {tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0"], tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 679 : i32}} { 
func.func @__inference_pruned_131() -> tensor<*xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "variable", outputs = "identity_retval_RetVal"}} { %0 = tf_executor.graph { diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/whileop.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/whileop.mlir index 5858a015061596..3a1e6b1e8cbc97 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/whileop.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/whileop.mlir @@ -19,7 +19,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-LABEL: func @while_test_remove_unused_results // CHECK: [[pred:%.*]] = tfrt_fallback_async.predicate // CHECK-NEXT: tfrt.while [[pred]] @"[[while_func_prefix:.*]]/tfrt_body_1" - // CHECK-SAME: (!tfrt.chain, !corert.tensorhandle) -> (!tfrt.chain, !corert.tensorhandle) + // CHECK-SAME: (!tfrt.chain, !tfrt_fallback.tf_tensor) -> (!tfrt.chain, !tfrt_fallback.tf_tensor) // CHECK-NOT: func.call func.func @while_test_remove_unused_results(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { %0:2 = "tf.While"(%arg0, %arg1) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor, tensor) -> (tensor, tensor) diff --git a/tensorflow/compiler/mlir/tfrt/tests/xla_launch_fallback.mlir b/tensorflow/compiler/mlir/tfrt/tests/xla_launch_fallback.mlir index 3556f932ac1f2f..4ac2e850ae642f 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/xla_launch_fallback.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/xla_launch_fallback.mlir @@ -1,4 +1,4 @@ -// RUN: tf-tfrt-opt -split-input-file -tf-executor-to-tfrt-pipeline="target-gpu=true func-use-fallback-tensor=true" -tfrt-lower-tf-savedmodel=hoist-invariant-ops=true %s | FileCheck %s --dump-input=fail --dump-input-filter=all +// RUN: tf-tfrt-opt -split-input-file -tf-executor-to-tfrt-pipeline="target-gpu=true" -tfrt-lower-tf-savedmodel=hoist-invariant-ops=true %s | FileCheck %s --dump-input=fail --dump-input-filter=all func.func private @xla_func_0(%arg0: tensor<1x3xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._XlaMustCompile = true, tf._noinline = true, tf._original_func_name = "should_not_be_used"} { %1 = "tf.AddV2"(%arg0, %arg1) : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> diff --git a/tensorflow/compiler/mlir/tfrt/tests/xla_launch_lowering.mlir b/tensorflow/compiler/mlir/tfrt/tests/xla_launch_lowering.mlir index 934166aae198fc..c53b025468d950 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/xla_launch_lowering.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/xla_launch_lowering.mlir @@ -1,4 +1,4 @@ -// RUN: tf-tfrt-opt -split-input-file -tf-executor-to-tfrt-pipeline="target-gpu=true use-gpu-compile-and-execute-op=true func-use-fallback-tensor=true" -tfrt-lower-tf-savedmodel=hoist-invariant-ops=true %s | FileCheck %s --dump-input=fail --dump-input-filter=all +// RUN: tf-tfrt-opt -split-input-file -tf-executor-to-tfrt-pipeline="target-gpu=true use-gpu-compile-and-execute-op=true" -tfrt-lower-tf-savedmodel=hoist-invariant-ops=true %s | FileCheck %s --dump-input=fail --dump-input-filter=all func.func private @xla_func_0(%arg0: tensor<1x3xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {tf._XlaMustCompile = true, tf._noinline = true, tf._original_func_name = "should_not_be_used"} { %1 = "tf.AddV2"(%arg0, %arg1) : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> diff --git a/tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.cc 
b/tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.cc index efedf36452dc12..a0ae8cb06e45df 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/attr_lowering_utils.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OperationSupport.h" #include "mlir/IR/Types.h" +#include "mlir/Support/LLVM.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" @@ -37,39 +38,39 @@ mlir::TypeAttr ConvertTypeAttribute(mlir::TypeAttr type_attr, if (IsSupportedTfrtNumericDType(type)) return type_attr; // For TF custom types, we convert it to custom corert types. - if (type.isa()) + if (mlir::isa(type)) return mlir::TypeAttr::get( tfrt::corert::StringType::get(builder.getContext())); - if (type.isa()) + if (mlir::isa(type)) return mlir::TypeAttr::get( tfrt::corert::ResourceType::get(builder.getContext())); - if (type.isa()) + if (mlir::isa(type)) return mlir::TypeAttr::get( tfrt::corert::VariantType::get(builder.getContext())); - if (type.isa()) { + if (mlir::isa(type)) { return mlir::TypeAttr::get( tfrt::corert::Quint8Type::get(builder.getContext())); } - if (type.isa()) { + if (mlir::isa(type)) { return mlir::TypeAttr::get( tfrt::corert::Quint16Type::get(builder.getContext())); } - if (type.isa()) { + if (mlir::isa(type)) { return mlir::TypeAttr::get( tfrt::corert::Qint8Type::get(builder.getContext())); } - if (type.isa()) { + if (mlir::isa(type)) { return mlir::TypeAttr::get( tfrt::corert::Qint16Type::get(builder.getContext())); } - if (type.isa()) { + if (mlir::isa(type)) { return mlir::TypeAttr::get( tfrt::corert::Qint32Type::get(builder.getContext())); } @@ -86,14 +87,15 @@ mlir::Attribute ConvertAttribute(mlir::Attribute attr, mlir::Builder& builder) { // attributes are not supported yet. // Return directly if the attribute is already supported. - if (attr.isa()) + if (mlir::isa(attr)) return attr; // For type attributes, we convert non-standard MLIR types to corresponding // corert types. - if (auto type_attr = attr.dyn_cast()) { - if (auto shape_type = type_attr.getValue().dyn_cast()) { + if (auto type_attr = mlir::dyn_cast(attr)) { + if (auto shape_type = + mlir::dyn_cast(type_attr.getValue())) { if (!shape_type.hasRank()) return tfrt::corert::ShapeAttr::get(builder.getContext()); @@ -106,7 +108,7 @@ mlir::Attribute ConvertAttribute(mlir::Attribute attr, mlir::Builder& builder) { // Convert the attribute to the corresponding format in TFRT dialect if // needed. - if (auto shape_attr = attr.dyn_cast()) { + if (auto shape_attr = mlir::dyn_cast(attr)) { if (!shape_attr.hasRank()) return tfrt::corert::ShapeAttr::get(builder.getContext()); return tfrt::corert::ShapeAttr::get(builder.getContext(), @@ -114,7 +116,7 @@ mlir::Attribute ConvertAttribute(mlir::Attribute attr, mlir::Builder& builder) { } // For arrays, we recursively convert the elements. 
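The attr_lowering_utils.cc hunks above, and most of the C++ hunks that follow, apply one mechanical migration: the deprecated member-function casts on mlir::Type and mlir::Attribute are replaced by the free functions re-exported from mlir/Support/LLVM.h, which is why that include is added to each file. A minimal self-contained sketch of the idiom, using builtin attributes rather than the TF custom types above (the helper names are illustrative, not from this patch):

#include <optional>

#include "llvm/ADT/StringRef.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Support/LLVM.h"

// The free-function casts mirror llvm::isa/dyn_cast/cast and work uniformly
// on MLIR attributes, types, and values; attr.isa<T>() et al. are deprecated.
bool IsStringAttr(mlir::Attribute attr) {
  return mlir::isa<mlir::StringAttr>(attr);
}

std::optional<llvm::StringRef> GetStringValue(mlir::Attribute attr) {
  // dyn_cast yields a null attribute handle on mismatch, so it can be
  // tested directly in the condition.
  if (auto str_attr = mlir::dyn_cast<mlir::StringAttr>(attr))
    return str_attr.getValue();
  return std::nullopt;
}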
- if (auto array_attr = attr.dyn_cast()) { + if (auto array_attr = mlir::dyn_cast(attr)) { llvm::SmallVector attrs; attrs.reserve(array_attr.size()); for (auto attr : array_attr) { @@ -140,7 +142,7 @@ bool IsSupportedTfrtNumericDType(mlir::Type type) { type.isUnsignedInteger(64)) return true; - if (auto complex_type = type.dyn_cast()) { + if (auto complex_type = mlir::dyn_cast(type)) { auto element_type = complex_type.getElementType(); if (element_type.isF32() || element_type.isF64()) return true; } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/corert_converter.cc b/tensorflow/compiler/mlir/tfrt/transforms/corert_converter.cc index 48d9f755c16c7b..910f7a83a9f7af 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/corert_converter.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/corert_converter.cc @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" #include "mlir/IR/Types.h" #include "mlir/Pass/PassManager.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -46,7 +47,7 @@ CoreRTConverter::CoreRTConverter( addConversion([](tfrt::corert::TensorHandleType type) { return type; }); addConversion([=](mlir::TensorType type) -> std::optional { // Ref types are not supported in both compiler and runtime. - if (type.getElementType().isa()) + if (mlir::isa(type.getElementType())) return std::nullopt; return tensor_handle_type(); }); @@ -74,8 +75,8 @@ mlir::ArrayAttr CoreRTConverter::CreateOpFuncAttrs( auto attr_key = key_and_value.getName(); auto attr_value = key_and_value.getValue(); if (!IsUnusedTfrtAttribute(attr_key) && - attr_value.isa()) { - auto func_attr = attr_value.dyn_cast(); + mlir::isa(attr_value)) { + auto func_attr = mlir::dyn_cast(attr_value); auto converted = CanonicalizeTensorflowFunctionName( symbol_table, func_attr.getValue().str(), use_mlir_func_name); if (!converted) return {}; @@ -126,7 +127,7 @@ std::optional CoreRTConverter::ParseDeviceName( } auto parsed_device_name = - ParseDeviceName(device_attr.cast().getValue()); + ParseDeviceName(mlir::cast(device_attr).getValue()); if (!parsed_device_name) op->emitWarning("failed to parse device name."); return parsed_device_name; } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/cross_device_transfer.cc b/tensorflow/compiler/mlir/tfrt/transforms/cross_device_transfer.cc index 2b1e29c5347096..5f539b8c520e65 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/cross_device_transfer.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/cross_device_transfer.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/core/util/device_name_utils.h" #include "tfrt/basic_kernels/opdefs/basic_kernels.h" // from @tf_runtime @@ -81,8 +82,8 @@ static std::string GetDevice(Operation *op) { SmallVector, 4> attrs; execute_op.getOpAttrs(&attrs); for (std::pair entry : attrs) { - if (entry.first == kDeviceAttr && entry.second.isa()) { - device = entry.second.cast().getValue().str(); + if (entry.first == kDeviceAttr && mlir::isa(entry.second)) { + device = mlir::cast(entry.second).getValue().str(); break; } } @@ -94,7 +95,7 @@ static std::string GetDevice(Operation *op) { // Return the device of the given value. static std::string GetDevice(mlir::Value value, func::FuncOp parent_func_op) { std::string device = ""; - if (BlockArgument block_arg = value.dyn_cast()) { + if (BlockArgument block_arg = mlir::dyn_cast(value)) { if (StringAttr device_attr = parent_func_op.getArgAttrOfType( block_arg.getArgNumber(), kTFRTDeviceAttr)) { device = device_attr.getValue().str(); @@ -140,10 +141,10 @@ void CrossDeviceTransferPass::runOnOperation() { for (mlir::Value arg : op->getOperands()) { // Do not transfer non-TensorHandle values. - if (!arg.getType().isa()) continue; + if (!mlir::isa(arg.getType())) continue; // Do not transfer the result of corert.transfer op. - if (OpResult op_result = arg.dyn_cast()) { + if (OpResult op_result = mlir::dyn_cast(arg)) { Operation *defining_op = arg.getDefiningOp(); if (llvm::isa(defining_op)) continue; } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.cc b/tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.cc index ef8c2ec38ce64b..77759a631f177f 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/fallback_converter.cc @@ -16,6 +16,7 @@ limitations under the License. #include +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" @@ -31,7 +32,7 @@ FallbackConverter::FallbackConverter(mlir::MLIRContext *context) addConversion([](tfrt::fallback::TFTensorType type) { return type; }); addConversion([=](mlir::TensorType type) -> std::optional { // Ref types are not supported in both compiler and runtime. 
- if (type.getElementType().isa()) { + if (mlir::isa(type.getElementType())) { return std::nullopt; } @@ -46,9 +47,9 @@ FallbackConverter::FallbackConverter(mlir::MLIRContext *context) mlir::Value ConvertCoreRTTensorHandleToFallbackTensor( mlir::Location loc, llvm::StringRef device, mlir::Value value, mlir::ConversionPatternRewriter &rewriter) { - if (value.getType().isa()) return value; + if (mlir::isa(value.getType())) return value; - if (!value.getType().isa()) return {}; + if (!mlir::isa(value.getType())) return {}; mlir::OpBuilder::InsertionGuard guard(rewriter); @@ -82,9 +83,9 @@ mlir::Value ConvertCoreRTTensorHandleToFallbackTensor( mlir::Value ConvertFallbackTensorToCoreRTTensorHandle( mlir::Location loc, mlir::Value value, mlir::ConversionPatternRewriter &rewriter) { - if (value.getType().isa()) return value; + if (mlir::isa(value.getType())) return value; - if (!value.getType().isa()) return {}; + if (!mlir::isa(value.getType())) return {}; // Use CPU device by default if no device is specified. llvm::StringRef device = GetDefaultCpuDeviceName(); @@ -134,7 +135,7 @@ mlir::LogicalResult ConvertFallbackOperands( llvm::SmallVectorImpl *new_operands, mlir::ConversionPatternRewriter &rewriter) { for (auto operand : operands) { - if (!operand.getType().isa()) { + if (!mlir::isa(operand.getType())) { auto new_operand = ConvertCoreRTTensorHandleToFallbackTensor( op->getLoc(), device, operand, rewriter); if (!new_operand) diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD index 305195e744932f..b06b37c9e6cb27 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD @@ -15,6 +15,7 @@ package_group( "//tensorflow/core/tfrt/saved_model/tests/...", ] + if_google([ "//learning/brain/tfrt/cpp_tests/...", + "//learning/serving/servables/tfrt/...", "//learning/pathways/serving/runtime/...", "//learning/pathways/serving/tests/...", "//learning/brain/tfrt/mlir/mlrt/application/pathways/compiler/...", @@ -124,15 +125,9 @@ cc_library( ":ifrt_constants", ":ifrt_types", "//tensorflow/compiler/jit:xla_cpu_jit", - "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_a_m_inc_gen", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_n_z_inc_gen", - "//tensorflow/compiler/mlir/tensorflow:visitor", "//tensorflow/compiler/mlir/tf2xla/api/v2:legalize_tf", - "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:core_cpu_base", @@ -199,6 +194,7 @@ tf_cc_test( ], tags = ["no_oss"], deps = [ + ":ifrt_types", ":tf2hlo", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/tf2xla:xla_helpers", @@ -265,6 +261,7 @@ tf_cc_test( "//tensorflow/core/platform:resource_loader", "//tensorflow/core/tfrt/graph_executor:graph_execution_options", "//tensorflow/core/tfrt/ifrt:ifrt_model_context", + "//tensorflow/core/tfrt/ifrt:ifrt_serving_core_selector", "//tensorflow/core/tfrt/runtime", "//tensorflow/core/tfrt/saved_model:saved_model_testutil", "@com_google_absl//absl/strings", @@ -272,6 +269,7 @@ tf_cc_test( "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", + 
"@local_tsl//tsl/framework/test_util:mock_serving_device_selector", "@local_tsl//tsl/platform:env", "@local_tsl//tsl/platform:statusor", "@local_xla//xla/python/ifrt", diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc index ebaf2570bba3f4..93e41027ec594e 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc @@ -88,13 +88,18 @@ CompileAndRegisterIfrtPrograms(absl::string_view model_name, } }); - auto executable = std::make_unique( - model_name, entry_function_name.str(), *std::move(submodule), - ifrt_model_context.GetClient(), &ifrt_model_context.GetThreadPool(), - &ifrt_model_context.GetLoadedVariableRegistry(), - &ifrt_model_context.GetRestoreTensorRegistry(), - ifrt_model_context.GetDeviceMgr(), - ifrt_model_context.GetShapeRepresentationFn()); + TF_ASSIGN_OR_RETURN( + auto executable, + IfrtServingExecutable::Create( + program_id, model_name, entry_function_name.str(), + *std::move(submodule), ifrt_model_context.GetClient(), + &ifrt_model_context.GetThreadPool(), + &ifrt_model_context.GetLoadedVariableRegistry(), + &ifrt_model_context.GetRestoreTensorRegistry(), + ifrt_model_context.checkpoint_loader_queue(), + ifrt_model_context.GetDeviceMgr(), + ifrt_model_context.GetShapeRepresentationFn(), + ifrt_model_context.GetIfrtServingCoreSelector())); // Register the Ifrt program to `ServingExecutableRegistry` so that // the client TF program can invoke them via `IfrtCall` op. diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h index 2407fe7cc3546c..085c70812feaed 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.h @@ -31,6 +31,12 @@ class IfrtBackendCompiler : public tensorflow::BackendCompiler { explicit IfrtBackendCompiler(TpuCompiler* tpu_compiler = nullptr) : tpu_compiler_(tpu_compiler) {} + void GetDependentDialects(mlir::DialectRegistry& registry) const override { + if (tpu_compiler_) { + tpu_compiler_->RegisterTPUDialects(®istry); + } + } + // Rewrites the tensorflow graph in MLIR for IFRT serving. The methods // extracts regions for IFRT execution on accelerator (e.g. TPU). absl::Status CompileTensorflow( diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc index 71ba7724de922b..dea849f2a1e3fa 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc @@ -34,8 +34,10 @@ limitations under the License. 
#include "tensorflow/core/platform/test.h" #include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h" #include "tensorflow/core/tfrt/ifrt/ifrt_model_context.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h" #include "tensorflow/core/tfrt/runtime/runtime.h" #include "tensorflow/core/tfrt/saved_model/saved_model_testutil.h" +#include "tsl/framework/test_util/mock_serving_device_selector.h" #include "tsl/lib/core/status_test_util.h" #include "tsl/platform/env.h" #include "tsl/platform/statusor.h" @@ -85,8 +87,11 @@ TEST(IfrtBackendCompilerTest, Basic) { tensorflow::tfrt_stub::ModelRuntimeContext runtime_context( &graph_execution_options, /*export_dir=*/"", &resource_context); + tsl::test_util::MockServingDeviceSelector mock_serving_device_selector; + IfrtServingCoreSelector core_selector(&mock_serving_device_selector); + runtime_context.resource_context().CreateResource( - "IfrtModelContext", client, &GetThreadPool()); + "IfrtModelContext", client, &core_selector, &GetThreadPool()); IfrtBackendCompiler compiler; TF_ASSERT_OK(compiler.CompileTensorflow(runtime_context, mlir_module.get())); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc index b3bf510003e797..49a7e817ed8f60 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc @@ -195,6 +195,9 @@ class SinkVariableAsNamedArrayPass // ReadVariableOp. module.walk([&](mlir::TF::ReadVariableOp read_variable_op) { if (!read_variable_op->use_empty()) { + // This variable tensor is used by CPU host. + read_to_load[read_variable_op].setUsedByHost(true); + // Replace CPU use of ReadVariableOp read_variable_op.replaceAllUsesWith( read_to_load[read_variable_op].getTensorFuture()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/ifrt_cluster.mlir b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/ifrt_cluster.mlir index 1c3ab3703e2384..f93c2532e23d18 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/ifrt_cluster.mlir +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/ifrt_cluster.mlir @@ -1,9 +1,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func.func @main() { - "tf_device.cluster_func"() {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () + "tf_device.cluster_func"() {_replication_info = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () func.return } func.func @empty_func() { func.return } -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc index a0b01ba1ffc3f7..0210eb3ed2dc62 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc @@ -66,31 +66,11 @@ namespace tensorflow { namespace ifrt_serving { namespace { static 
constexpr absl::string_view kEntryFuncName = "main";
+}  // namespace

-absl::StatusOr<tensorflow::tpu::TPUCompileMetadataProto> GetCompileMetadata(
-    mlir::func::FuncOp op, absl::Span<const DtypeAndShape> inputs,
-    const xla::ifrt::Client& ifrt_client) {
-  tensorflow::tpu::TPUCompileMetadataProto metadata;
-
-  auto metadata_text_attr =
-      op->getAttrOfType<mlir::StringAttr>(kMetadataTextAttrName);
-
-  if (metadata_text_attr && !metadata_text_attr.getValue().empty()) {
-    // Try __tpu_compile_metadata_text attribute. This only for debugging
-    // purpose.
-    VLOG(1) << "Parsing from attribute " << kMetadataTextAttrName
-            << metadata_text_attr.getValue().str();
-    if (!tsl::protobuf::TextFormat::ParseFromString(
-            metadata_text_attr.getValue().str(), &metadata)) {
-      return absl::InvalidArgumentError(absl::StrCat(
-          "Attribute ", kMetadataTextAttrName, ":",
-          metadata_text_attr.getValue().str(), " cannot be parsed"));
-    }
-  } else {
-    return absl::InvalidArgumentError(
-        absl::StrCat("Missing ", kMetadataTextAttrName));
-  }
-
+absl::Status UpdateCompileMetadata(
+    tensorflow::tpu::TPUCompileMetadataProto& metadata,
+    absl::Span<const DtypeAndShape> inputs) {
   VLOG(3) << "TpuCompileMetadata before shape is populated " << metadata;
   if (metadata.num_replicas() < 1 || metadata.num_cores_per_replica() < 1) {
     return absl::InternalError(
@@ -98,11 +78,6 @@ absl::StatusOr<tensorflow::tpu::TPUCompileMetadataProto> GetCompileMetadata(
         " and number of cores per replica ", metadata.num_cores_per_replica(),
         " must be >= 1"));
   }
-  if (op.getNumResults() != metadata.retvals_size()) {
-    return absl::InternalError(
-        absl::StrCat("Number of retvals mismatched! Expected ",
-                     op.getNumResults(), " got ", metadata.retvals_size()));
-  }
   if (metadata.args_size() != inputs.size()) {
     return absl::InternalError(
         absl::StrCat("Number of inputs mismatched! Expected ",
@@ -125,10 +100,39 @@ absl::StatusOr<tensorflow::tpu::TPUCompileMetadataProto> GetCompileMetadata(
     // Update shape.
     *metadata.mutable_args(i)->mutable_shape() = inputs[i].shape.AsProto();
   }
+  return absl::OkStatus();
+}
+
+absl::StatusOr<tensorflow::tpu::TPUCompileMetadataProto> GetCompileMetadata(
+    mlir::ModuleOp module, const xla::ifrt::Client& ifrt_client) {
+  tensorflow::tpu::TPUCompileMetadataProto metadata;
+
+  auto op = module.lookupSymbol<mlir::func::FuncOp>(kEntryFuncName);
+  if (!op) {
+    return absl::InternalError("Could not find entry function in MLIR Module.");
+  }
+
+  auto metadata_text_attr =
+      op->getAttrOfType<mlir::StringAttr>(kMetadataTextAttrName);
+
+  if (metadata_text_attr && !metadata_text_attr.getValue().empty()) {
+    // Try the __tpu_compile_metadata_text attribute. This is only for
+    // debugging purposes.
+    VLOG(1) << "Parsing from attribute " << kMetadataTextAttrName
+            << metadata_text_attr.getValue().str();
+    if (!tsl::protobuf::TextFormat::ParseFromString(
+            metadata_text_attr.getValue().str(), &metadata)) {
+      return absl::InvalidArgumentError(absl::StrCat(
+          "Attribute ", kMetadataTextAttrName, ":",
+          metadata_text_attr.getValue().str(), " cannot be parsed"));
+    }
+  } else {
+    return absl::InvalidArgumentError(
+        absl::StrCat("Missing ", kMetadataTextAttrName));
+  }

   // Create a default device assignment if one is not given by the model.
   if (!metadata.has_device_assignment()) {
-    // TODO(b/316068010): integrate core selection.
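// With this split, callers first fetch the metadata from the module, then
// bind the input shapes, then compile; the updated tests further below
// exercise exactly this sequence:
//   TF_ASSIGN_OR_RETURN(auto metadata, GetCompileMetadata(module, client));
//   TF_RETURN_IF_ERROR(UpdateCompileMetadata(metadata, dtype_and_shapes));
//   auto result = CompileTfToHlo(module, dtype_and_shapes, "main", client,
//                                metadata, shape_representation_fn);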
TF_ASSIGN_OR_RETURN( auto device_assignment, ifrt_client.GetDefaultDeviceAssignment( @@ -142,11 +146,11 @@ absl::StatusOr GetCompileMetadata( return metadata; } -} // namespace absl::StatusOr CompileTfToHlo( mlir::ModuleOp module, absl::Span inputs, absl::string_view entry_function_name, const xla::ifrt::Client& ifrt_client, + const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata, tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn) { if (VLOG_IS_ON(1)) { tensorflow::DumpMlirOpToFile("ifrt_before_bridge_phase2", module); @@ -165,21 +169,6 @@ absl::StatusOr CompileTfToHlo( TF_ASSIGN_OR_RETURN( auto* client, xla::ClientLibrary::GetOrCreateCompileOnlyClient(platform)); - auto entry_fn = module.lookupSymbol(kEntryFuncName); - if (!entry_fn) { - return absl::InternalError("Could not find entry function in MLIR Module."); - } - - if (inputs.size() != entry_fn.getNumArguments()) { - return absl::InternalError( - absl::StrCat("Entry function arguments mismatched! Expected ", - entry_fn.getNumArguments(), " got", inputs.size())); - } - - TF_ASSIGN_OR_RETURN(tensorflow::tpu::TPUCompileMetadataProto compile_metadata, - GetCompileMetadata(entry_fn, inputs, ifrt_client)); - - VLOG(1) << "Compilation metadata: " << compile_metadata; std::vector arg_shapes; for (const auto& input : inputs) { diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h index fec9bbb2c740e7..48d7cabdd14286 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h @@ -24,8 +24,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "xla/python/ifrt/client.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" @@ -38,11 +36,19 @@ struct Tf2HloResult { tf2xla::HostComputeMetadata host_compute_metadata; }; +absl::Status UpdateCompileMetadata( + tensorflow::tpu::TPUCompileMetadataProto& metadata, + absl::Span inputs); + +absl::StatusOr GetCompileMetadata( + mlir::ModuleOp module, const xla::ifrt::Client& ifrt_client); + // A class that convert tf module to hlo // TODO(b/304839793): provide wrap persistent compilation cache. absl::StatusOr CompileTfToHlo( mlir::ModuleOp module, absl::Span inputs, absl::string_view entry_function_name, const xla::ifrt::Client& ifrt_client, + const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata, tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn); } // namespace ifrt_serving diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc index 7ee1c450426b20..b201370ea3ae7b 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc @@ -25,6 +25,7 @@ limitations under the License. #include "absl/log/log.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "mlir/IR/AsmState.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/DialectRegistry.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -32,6 +33,7 @@ limitations under the License. 
#include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "xla/python/ifrt/client.h" #include "xla/python/ifrt/test_util.h" @@ -93,8 +95,14 @@ TEST(Tf2HloTest, Empty) { TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, xla::ifrt::test_util::GetClient()); - auto result = CompileTfToHlo(mlir_module.get(), {}, "main", *client, - tensorflow::IdentityShapeRepresentationFn()); + TF_ASSERT_OK_AND_ASSIGN( + tensorflow::tpu::TPUCompileMetadataProto compile_metadata, + GetCompileMetadata(mlir_module.get(), *client)); + TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, {})); + + auto result = + CompileTfToHlo(mlir_module.get(), {}, "main", *client, compile_metadata, + tensorflow::IdentityShapeRepresentationFn()); TF_ASSERT_OK(result.status()); } @@ -125,9 +133,15 @@ TEST(Tf2HloTest, Tuple) { std::vector dtype_and_shapes; dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {1, 3}}); dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {3, 1}}); - auto result = - CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, - tensorflow::IdentityShapeRepresentationFn()); + + TF_ASSERT_OK_AND_ASSIGN( + tensorflow::tpu::TPUCompileMetadataProto compile_metadata, + GetCompileMetadata(mlir_module.get(), *client)); + TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); + + auto result = CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", + *client, compile_metadata, + tensorflow::IdentityShapeRepresentationFn()); TF_ASSERT_OK(result.status()); } @@ -157,9 +171,15 @@ TEST(Tf2HloTest, Spmd) { std::vector dtype_and_shapes; dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {4, 64}}); - auto result = - CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, - tensorflow::IdentityShapeRepresentationFn()); + + TF_ASSERT_OK_AND_ASSIGN( + tensorflow::tpu::TPUCompileMetadataProto compile_metadata, + GetCompileMetadata(mlir_module.get(), *client)); + TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); + + auto result = CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", + *client, compile_metadata, + tensorflow::IdentityShapeRepresentationFn()); LOG(INFO) << result->compile_metadata; TF_ASSERT_OK(result.status()); @@ -227,9 +247,15 @@ TEST(Tf2HloTest, UsingDefaultDeviceAssignment) { dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {4, 64}}); dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {64, 10}}); dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {1, 4}}); - auto result = - CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, - tensorflow::IdentityShapeRepresentationFn()); + + TF_ASSERT_OK_AND_ASSIGN( + tensorflow::tpu::TPUCompileMetadataProto compile_metadata, + GetCompileMetadata(mlir_module.get(), *client)); + TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); + + auto result = CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", + *client, compile_metadata, + tensorflow::IdentityShapeRepresentationFn()); LOG(INFO) << result->compile_metadata; TF_ASSERT_OK(result.status()); @@ -323,9 +349,14 @@ TEST(Tf2HloTest, XlaCallHostCallback) { dtype_and_shapes.push_back(DtypeAndShape{DT_INT32, {1}}); dtype_and_shapes.push_back(DtypeAndShape{DT_INT32, {1}}); - auto result = - CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, - 
tensorflow::IdentityShapeRepresentationFn()); + TF_ASSERT_OK_AND_ASSIGN( + tensorflow::tpu::TPUCompileMetadataProto compile_metadata, + GetCompileMetadata(mlir_module.get(), *client)); + TF_ASSERT_OK(UpdateCompileMetadata(compile_metadata, dtype_and_shapes)); + + auto result = CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", + *client, compile_metadata, + tensorflow::IdentityShapeRepresentationFn()); TF_ASSERT_OK(result.status()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/insert_tensor_copy.cc b/tensorflow/compiler/mlir/tfrt/transforms/insert_tensor_copy.cc index 2b2cbbcf318d15..d6c87abeedd54a 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/insert_tensor_copy.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/insert_tensor_copy.cc @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" @@ -69,7 +70,7 @@ class InsertFallbackTensorCopy // Process function arguments first. for (auto arg : func_op.getArguments()) { - if (!arg.getType().isa()) continue; + if (!mlir::isa(arg.getType())) continue; InsertFallbackTensorCopyForValue(arg, func_op->getLoc(), builder, stream_analysis); } @@ -91,7 +92,7 @@ class InsertFallbackTensorCopy // Process each result value. for (auto result : op->getResults()) { - if (!result.getType().isa()) continue; + if (!mlir::isa(result.getType())) continue; InsertFallbackTensorCopyForValue(result, op->getLoc(), builder, stream_analysis); } @@ -147,7 +148,7 @@ class InsertFallbackTensorCopy // For each stream, we will create one new value that replaces the uses in // that stream. - assert(value.getType().isa()); + assert(mlir::isa(value.getType())); // The number of results is the number candidate streams. llvm::SmallVector result_types(copies.size(), diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lower_bound_batch_threads.cc b/tensorflow/compiler/mlir/tfrt/transforms/lower_bound_batch_threads.cc new file mode 100644 index 00000000000000..2c2883181e942c --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/lower_bound_batch_threads.cc @@ -0,0 +1,93 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h"
+
+namespace tensorflow {
+namespace tfrt_compiler {
+namespace {
+
+class LowerBoundBatchThreadsPass
+    : public mlir::PassWrapper<LowerBoundBatchThreadsPass,
+                               mlir::OperationPass<mlir::ModuleOp>> {
+ public:
+  explicit LowerBoundBatchThreadsPass(uint64_t min_num_batch_threads)
+      : mlir::PassWrapper<LowerBoundBatchThreadsPass,
+                          mlir::OperationPass<mlir::ModuleOp>>() {
+    min_num_batch_threads_ = min_num_batch_threads;
+  }
+  LowerBoundBatchThreadsPass()
+      : mlir::PassWrapper<LowerBoundBatchThreadsPass,
+                          mlir::OperationPass<mlir::ModuleOp>>() {}
+  LowerBoundBatchThreadsPass(const LowerBoundBatchThreadsPass& other)
+      : mlir::PassWrapper<LowerBoundBatchThreadsPass,
+                          mlir::OperationPass<mlir::ModuleOp>>(other) {}
+
+  LowerBoundBatchThreadsPass& operator=(
+      const LowerBoundBatchThreadsPass& other) = delete;
+
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerBoundBatchThreadsPass)
+
+ private:
+  llvm::StringRef getArgument() const final {
+    return "tfrt-lower-bound-batch-threads";
+  }
+
+  llvm::StringRef getDescription() const final {
+    return "Lower bound batch threads for batch ops.";
+  }
+
+  void runOnOperation() override {
+    if (min_num_batch_threads_ > 0) {
+      mlir::ModuleOp module = getOperation();
+      module.walk([&](mlir::TF::BatchFunctionOp batch_op) {
+        int64_t num_batch_threads = batch_op.getNumBatchThreads();
+        num_batch_threads =
+            std::max(num_batch_threads, min_num_batch_threads_.getValue());
+        batch_op.setNumBatchThreads(num_batch_threads);
+      });
+    }
+  }
+
+ protected:
+  mlir::Pass::Option<int64_t> min_num_batch_threads_{
+      *this, "tfrt-min-num-batch-threads", llvm::cl::init(1),
+      llvm::cl::desc("Minimum number of batch threads")};
+};
+
+}  // namespace
+
+std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
+CreateLowerBoundBatchThreadsPass(int64_t min_num_batch_threads) {
+  return std::make_unique<LowerBoundBatchThreadsPass>(min_num_batch_threads);
+}
+
+static mlir::PassRegistration<LowerBoundBatchThreadsPass> register_pass;
+
+}  // namespace tfrt_compiler
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
index 17e3d8be95204d..01ae5811b46b9a 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
@@ -44,6 +44,7 @@ limitations under the License.
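The new pass above is wired into the pre-invariant-optimization pipeline in passes.cc further down in this diff. As a standalone illustration, a hedged sketch of invoking it through the factory declared in passes.h (the driver function here is hypothetical):

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h"

// Hypothetical driver: raise num_batch_threads on every tf.BatchFunction in
// `module` to at least 4, leaving already-larger values untouched.
mlir::LogicalResult RaiseBatchThreadFloor(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addPass(tensorflow::tfrt_compiler::CreateLowerBoundBatchThreadsPass(4));
  return pm.run(module);
}

The pass is a no-op when the configured minimum is zero, so the default pipeline behavior is unchanged unless options.min_num_batch_threads is set.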
#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" @@ -231,7 +232,7 @@ void FindCalleesRecursiveForOp(const mlir::SymbolTable &symbol_table, llvm::StringSet<> &callees) { for (const auto &named_attr : op->getAttrs()) { if (auto symbol_attr = - named_attr.getValue().dyn_cast()) { + mlir::dyn_cast(named_attr.getValue())) { auto symbol = symbol_attr.getValue(); if (!callees.contains(symbol)) { callees.insert(symbol); @@ -337,7 +338,8 @@ class LowerTFSavedModelPass func_op->removeAttr(kTfSavedModelExportedNamesAttr); for (auto exported_name : exported_names) { auto exported_func_op = func_op.clone(); - exported_func_op.setName(exported_name.cast()); + exported_func_op.setName( + mlir::cast(exported_name)); // If it is a session initializer, we want to maximize parallelism // and do not perform any stream merge, to minimize latency. @@ -631,8 +633,8 @@ class ConvertReferenceVariableToResourceVariablePass mlir::LogicalResult ConvertReferenceVariableToResourceVariable( mlir::TF::VariableV2Op var_op) { - auto tensor_type = - mlir::TF::DropRefType(var_op.getRef().getType()).cast(); + auto tensor_type = mlir::cast( + mlir::TF::DropRefType(var_op.getRef().getType())); llvm::SmallVector identity_ops; llvm::SmallVector assign_ops; diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD index d7fafb49ee6cdd..ed518285828d1a 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD @@ -1,7 +1,6 @@ package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ - # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", "//tensorflow/compiler/mlir/tfrt:__subpackages__", "//tensorflow/core/tfrt:__subpackages__", ], diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc index af932ff5011895..03817906b9772c 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc @@ -59,37 +59,39 @@ absl::StatusOr ConvertTfMlirToBytecode( mlrt::bc::Buffer bytecode_buffer; TF_RETURN_IF_ERROR(ConvertTfMlirToRuntimeExecutable( options, module, - [&bytecode_buffer, &fallback_state, &model_context, module_with_op_keys]( - mlir::PassManager& pm, mlir::ModuleOp module, - const TfrtPipelineOptions& options) { - if (auto* flib_def = model_context.function_library_definition()) { - // Copy the module before exporting as exporting to graph will - // transform the MLIR to TFG dialect. 
- mlir::OwningOpRef copy(module.clone()); - TF_RETURN_IF_ERROR( - ExportFunctionDefs(*copy, [flib_def](FunctionDef function_def) { - VLOG(1) << absl::StrCat( - "Exporting MLIR function as function_def: ", - // clang-tidy off - function_def.DebugString() - // clang-tidy on - ); + [&bytecode_buffer, &fallback_state, &model_context, + backend_compiler = options.backend_compiler, + module_with_op_keys](mlir::PassManager& pm, mlir::ModuleOp module, + const TfrtPipelineOptions& options) { + if (backend_compiler) { + if (auto* flib_def = model_context.function_library_definition()) { + // Copy the module before exporting as exporting to graph will + // transform the MLIR to TFG dialect. + mlir::OwningOpRef copy(module.clone()); + TF_RETURN_IF_ERROR( + ExportFunctionDefs(*copy, [flib_def](FunctionDef function_def) { + VLOG(1) << absl::StrCat( + "Exporting MLIR function as function_def: ", + // NOLINTNEXTLINE + function_def.DebugString()); - // The TF MLIR compiler may change the function name. Then we - // need to retrieve the original name from the - // _original_func_name attribute. - auto iter = function_def.attr().find("_original_func_name"); - if (iter != function_def.attr().end()) { - function_def.mutable_signature()->set_name(iter->second.s()); - } + // The TF MLIR compiler may change the function name. Then we + // need to retrieve the original name from the + // _original_func_name attribute. + auto iter = function_def.attr().find("_original_func_name"); + if (iter != function_def.attr().end()) { + function_def.mutable_signature()->set_name( + iter->second.s()); + } - const auto& name = function_def.signature().name(); - if (flib_def->Contains(name)) { - TF_RETURN_IF_ERROR(flib_def->RemoveFunction(name)); - } + const auto& name = function_def.signature().name(); + if (flib_def->Contains(name)) { + TF_RETURN_IF_ERROR(flib_def->RemoveFunction(name)); + } - return flib_def->AddFunctionDef(function_def); - })); + return flib_def->AddFunctionDef(function_def); + })); + } } mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc index 350c424636b2f8..0eba87bb7dfcf1 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -37,6 +37,7 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/host_runtime/tfrt_ops.h.inc" @@ -349,7 +350,8 @@ class TFIfrtLoadVariableOpConversion auto new_op = rewriter.create( op.getLoc(), result_types, adaptor.getOperands()[0], - op.getDeviceShardingConfigProtoTextAttr(), op.getNameAttr()); + op.getDeviceShardingConfigProtoTextAttr(), op.getNameAttr(), + op.getUsedByHostAttr()); rewriter.replaceOp(op, new_op); return mlir::success(); @@ -380,11 +382,11 @@ class IfrtRestoreVariableOpConversion }; std::optional DecodeLongName(mlir::Location loc) { - if (auto name_loc = loc.dyn_cast()) { + if (auto name_loc = mlir::dyn_cast(loc)) { return name_loc.getName().str(); } - if (auto fused_loc = loc.dyn_cast()) { + if (auto fused_loc = mlir::dyn_cast(loc)) { std::string fused_name; for (auto l : fused_loc.getLocations()) { if (auto n = DecodeLongName(l)) { @@ -462,7 +464,7 @@ class ExecuteOpConversion final : public mlir::ConversionPattern { tensorflow::TensorProto tensor_proto; auto status = ConvertToTensorProto(const_op.getValue(), &tensor_proto); if (!status.ok()) - return const_op.emitError(tsl::NullTerminatedMessage(status)); + return const_op.emitError(absl::StatusMessageAsCStr(status)); rewriter.replaceOpWithNewOp( op, rewriter.getType(), @@ -1027,7 +1029,7 @@ class TfToMlrtConversionPass type_converter_.addConversion( [=](mlir::TensorType type) -> std::optional { // Ref types are not supported in both compiler and runtime. - if (type.getElementType().isa()) + if (mlir::isa(type.getElementType())) return std::nullopt; return tf_mlrt::TFTensorType::get(context); }); @@ -1037,8 +1039,8 @@ class TfToMlrtConversionPass mlir::ValueRange inputs, mlir::Location loc) -> mlir::Value { if (inputs.size() != 1) return mlir::Value(); - if (inputs[0].getType().isa()) { - if (desired_type.isa()) { + if (mlir::isa(inputs[0].getType())) { + if (mlir::isa(desired_type)) { return builder.create(loc, desired_type, inputs[0]); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc index a7975c40e1ff48..0bc2a9617b12d6 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc @@ -286,6 +286,10 @@ class WhileToMapFnPass for (auto tensor_list_index : loop_info.tensor_list_or_flow_in) { mlir::Operation *tensor_list_or_flow_in_defining_op = while_op.getOperand(tensor_list_index).getDefiningOp(); + if (tensor_list_or_flow_in_defining_op == nullptr) { + return mlir::failure(); + } + mlir::Operation *max_iterations = nullptr; if (loop_info.max_iterations_arg_idx.has_value()) { max_iterations = diff --git a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc b/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc index e13e8f36b1a436..0e47fad312c7cc 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" @@ -41,7 +42,8 @@ class FoldDeviceIndex : public mlir::OpRewritePattern { int32_t i = 0; mlir::ArrayAttr device_names = op.getDeviceNames(); for (; i < device_names.size(); ++i) { - auto device_name = device_names[i].cast().getValue(); + auto device_name = + mlir::cast(device_names[i]).getValue(); if (device_name == parsed_name.type) break; } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc index a0d7dbb371e375..69bc9370424671 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc @@ -117,6 +117,10 @@ void CreateTFExecutorToTFPreInvariantOptimizationPipelineHelper( // Merge non-side-effecting tf.If ops if their operands are the same. pm.addPass(tfrt_compiler::CreateMergeTfIfOpsPass()); + // Lower bound on the number of batch threads in `tf.BatchFunction`. + pm.addPass(tfrt_compiler::CreateLowerBoundBatchThreadsPass( + options.min_num_batch_threads)); + // Deduplicate functions invoked by tf.BatchFunction with the same // shared_name pm.addPass( diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.h b/tensorflow/compiler/mlir/tfrt/transforms/passes.h index 6bcbf8dbad317b..e1c848210dd19b 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/passes.h @@ -66,6 +66,10 @@ std::unique_ptr> CreateMergeTfIfOpsPass(); std::unique_ptr> CreateDeduplicateFunctionsInovkedByBatchFunctionPass(); +// Create a pass to lower bound the number of threads in tf.BatchFunction. +std::unique_ptr> +CreateLowerBoundBatchThreadsPass(int64_t min_num_batch_threads); + // Create a pass to fuse the TPU Ops for TFRT. std::unique_ptr> CreateFuseTpuCompileAndExecutePass(); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc index 848498c68ba71c..8bdb39c913bf75 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/sink_in_invariant_ops.cc @@ -39,6 +39,7 @@ limitations under the License. 
#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -82,7 +83,7 @@ llvm::SmallVector FindValueInCallees( llvm::SmallDenseSet callees; for (const auto &named_attr : caller->getAttrs()) { if (auto symbol_attr = - named_attr.getValue().dyn_cast()) { + mlir::dyn_cast(named_attr.getValue())) { auto symbol = symbol_attr.getValue(); auto callee = symbol_table.lookup(symbol); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc index 693bd78df0b170..f090745e0ae1c4 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/tf_to_tfrt.cc @@ -41,6 +41,7 @@ limitations under the License. #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" @@ -181,7 +182,7 @@ class GpuCompileAndExecuteOpConversion if (!xla_function) { return op->emitWarning("failed to find 'function' attribute"); } - auto func_attr = xla_function.dyn_cast(); + auto func_attr = mlir::dyn_cast(xla_function); if (!func_attr || func_attr.getValue().empty()) { return op->emitWarning("failed to find a non-empty 'function' attribute"); } @@ -512,7 +513,7 @@ class FallbackConstOpConversion mlir::ConversionPatternRewriter &rewriter) const override { // Some data types are handled separately using a fast path. if (IsSupportedTfrtNumericDType(op.getDtype()) || - op.getDtype().isa()) + mlir::isa(op.getDtype())) return failure(); // For other data types that do not have a fast path (eg. quantized types), @@ -520,7 +521,7 @@ class FallbackConstOpConversion tensorflow::TensorProto tensor_proto; auto status = ConvertToTensorProto(op.getValue(), &tensor_proto); - if (!status.ok()) return op.emitError(tsl::NullTerminatedMessage(status)); + if (!status.ok()) return op.emitError(absl::StatusMessageAsCStr(status)); rewriter.replaceOpWithNewOp( op, rewriter.getType(), @@ -737,11 +738,11 @@ class FallbackBatchFunctionOpConversion // Lower a tf.Const op that creates a string tensor to a native // corert.create_string_tensor op. 
-class CoreRTConstDenseTensorOpConversion +class FallbackConstDenseTensorOpConversion : public mlir::OpConversionPattern { public: - CoreRTConstDenseTensorOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) + FallbackConstDenseTensorOpConversion(mlir::MLIRContext *context, + CoreRTConverter *corert_converter) : mlir::OpConversionPattern(context, kCoreRTBenefit), corert_converter_(*corert_converter) {} @@ -755,9 +756,9 @@ class CoreRTConstDenseTensorOpConversion if (auto parsed_device_name = corert_converter_.ParseDeviceName(op)) if (parsed_device_name->device_type != DEVICE_CPU) return failure(); - auto new_op = rewriter.create( - op.getLoc(), corert_converter_.tensor_handle_type(), - op.getValue().cast()); + auto new_op = rewriter.create( + op.getLoc(), rewriter.getType(), + mlir::cast(op.getValue())); rewriter.replaceOp(op, new_op->getResult(0)); return success(); } @@ -859,21 +860,21 @@ class TFRTFuncOpSignatureConversion // Lower a tf.Const op that creates a string tensor to a native // corert.create_string_tensor op. -class CoreRTConstStringTensorOpConversion +class FallbackConstStringTensorOpConversion : public mlir::OpConversionPattern { public: - CoreRTConstStringTensorOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) + FallbackConstStringTensorOpConversion(mlir::MLIRContext *context, + CoreRTConverter *corert_converter) : mlir::OpConversionPattern(context, kCoreRTBenefit), corert_converter_(*corert_converter) {} LogicalResult matchAndRewrite( mlir::TF::ConstOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // NOLINT - if (!op.getDtype().isa()) return failure(); + if (!mlir::isa(op.getDtype())) return failure(); DenseStringElementsAttr attr = - op.getValue().cast(); + mlir::cast(op.getValue()); llvm::SmallVector values; values.reserve(attr.getNumElements()); @@ -889,8 +890,8 @@ class CoreRTConstStringTensorOpConversion for (auto dim : shape) dims.push_back(rewriter.getIntegerAttr(i64_type, dim)); - auto new_op = rewriter.create( - op.getLoc(), corert_converter_.tensor_handle_type(), + auto new_op = rewriter.create( + op.getLoc(), rewriter.getType(), rewriter.getArrayAttr(dims), rewriter.getArrayAttr(values)); rewriter.replaceOp(op, new_op.getResult()); @@ -905,16 +906,11 @@ class CoreRTConstStringTensorOpConversion LogicalResult ConvertFunctionCallOperands( mlir::Operation *op, ValueRange operands, llvm::SmallVectorImpl *new_operands, - mlir::ConversionPatternRewriter &rewriter, bool func_use_fallback_tensor) { - if (func_use_fallback_tensor) { - // TODO(b/182232457): Support other devices. - return tfrt_compiler::ConvertFallbackOperands( - op, tfrt_compiler::GetDefaultCpuDeviceName(), operands, new_operands, - rewriter); - } else { - return tfrt_compiler::ConvertCoreRTOperands(op, operands, new_operands, - rewriter); - } + mlir::ConversionPatternRewriter &rewriter) { + // TODO(b/182232457): Support other devices. + return tfrt_compiler::ConvertFallbackOperands( + op, tfrt_compiler::GetDefaultCpuDeviceName(), operands, new_operands, + rewriter); } // Convert TF call ops (eg. StatefulPartitionedCall) to tfrt.call. 
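With func_use_fallback_tensor gone, the helper above is the single operand path for every call-like lowering in this file. A minimal sketch of a caller (the wrapper name is hypothetical; the helper is the one defined above):

// Hypothetical wrapper: materialize all operands of a call-like op as
// !tfrt_fallback.tf_tensor values (pinned to the default CPU device inside
// the helper) before the op itself is rewritten.
mlir::LogicalResult LowerCallOperandsExample(
    mlir::Operation *op, mlir::ValueRange operands,
    llvm::SmallVectorImpl<mlir::Value> &converted,
    mlir::ConversionPatternRewriter &rewriter) {
  return ConvertFunctionCallOperands(op, operands, &converted, rewriter);
}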
@@ -923,12 +919,10 @@ class TFRTCallOpConversion : public mlir::OpConversionPattern { public: TFRTCallOpConversion(mlir::MLIRContext *context, mlir::TypeConverter *type_converter, - CoreRTConverter *corert_converter, - bool func_use_fallback_tensor) + CoreRTConverter *corert_converter) : mlir::OpConversionPattern(context), type_converter_(*type_converter), - corert_converter_(*corert_converter), - func_use_fallback_tensor_(func_use_fallback_tensor) {} + corert_converter_(*corert_converter) {} LogicalResult matchAndRewrite( CallOp op, typename CallOp::Adaptor adaptor, @@ -953,8 +947,7 @@ class TFRTCallOpConversion : public mlir::OpConversionPattern { // operand is !tfrt_fallback.tf_tensor, and it is also used as fallback // tensor inside the callee function. if (mlir::failed(ConvertFunctionCallOperands(op, adaptor.getOperands(), - &new_operands, rewriter, - func_use_fallback_tensor_))) + &new_operands, rewriter))) return failure(); llvm::SmallVector result_types; @@ -982,7 +975,6 @@ class TFRTCallOpConversion : public mlir::OpConversionPattern { private: mlir::TypeConverter &type_converter_; CoreRTConverter &corert_converter_; - bool func_use_fallback_tensor_; }; // Convert func ReturnOp to tfrt.return. @@ -993,11 +985,9 @@ class TFRTReturnOpConversion : public mlir::OpConversionPattern { public: TFRTReturnOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter, - bool func_use_fallback_tensor) + CoreRTConverter *corert_converter) : mlir::OpConversionPattern(context), - corert_converter_(*corert_converter), - func_use_fallback_tensor_(func_use_fallback_tensor) {} + corert_converter_(*corert_converter) {} LogicalResult matchAndRewrite( mlir::func::ReturnOp op, OpAdaptor adaptor, @@ -1013,8 +1003,7 @@ class TFRTReturnOpConversion new_operands.push_back( corert_converter_.GetLocalSideEffectChain(op, &rewriter)); if (mlir::failed(ConvertFunctionCallOperands(op, adaptor.getOperands(), - &new_operands, rewriter, - func_use_fallback_tensor_))) + &new_operands, rewriter))) return failure(); rewriter.replaceOpWithNewOp(op, new_operands); @@ -1023,7 +1012,6 @@ class TFRTReturnOpConversion private: CoreRTConverter &corert_converter_; - bool func_use_fallback_tensor_; }; // Convert tf.Case op to tfrt.Case. @@ -1038,12 +1026,10 @@ class TFRTCaseOpConversion : public mlir::OpConversionPattern { public: TFRTCaseOpConversion(mlir::MLIRContext *context, mlir::TypeConverter *type_converter, - CoreRTConverter *corert_converter, - bool func_use_fallback_tensor) + CoreRTConverter *corert_converter) : mlir::OpConversionPattern(context), type_converter_(*type_converter), - corert_converter_(*corert_converter), - func_use_fallback_tensor_(func_use_fallback_tensor) {} + corert_converter_(*corert_converter) {} LogicalResult matchAndRewrite( TF::CaseOp op, OpAdaptor adaptor, @@ -1060,14 +1046,14 @@ class TFRTCaseOpConversion : public mlir::OpConversionPattern { llvm::SmallVector branch_operands; branch_operands.push_back( corert_converter_.GetLocalSideEffectChain(op, &rewriter)); - if (mlir::failed(ConvertFunctionCallOperands( - op, adaptor.getOperands().drop_front(), &branch_operands, rewriter, - func_use_fallback_tensor_))) + if (mlir::failed( + ConvertFunctionCallOperands(op, adaptor.getOperands().drop_front(), + &branch_operands, rewriter))) return failure(); mlir::Value index_operand = adaptor.getOperands()[0]; // TODO(b/182233401): Support TF tensor; remove the conversion op here. 
- if (index_operand.getType().isa()) { + if (mlir::isa(index_operand.getType())) { // TODO(b/182232457): Support other devices. index_operand = rewriter @@ -1079,7 +1065,7 @@ class TFRTCaseOpConversion : public mlir::OpConversionPattern { tfrt_compiler::GetDefaultCpuDeviceName()) .getResult(0); } - if (!index_operand.getType().isa()) + if (!mlir::isa(index_operand.getType())) return op.emitError( "branch index operand is expected to be a TensorHandle."); mlir::Value index_value = @@ -1096,12 +1082,11 @@ class TFRTCaseOpConversion : public mlir::OpConversionPattern { private: mlir::TypeConverter &type_converter_; CoreRTConverter &corert_converter_; - bool func_use_fallback_tensor_; }; static mlir::Value GetPredicate(mlir::Operation *op, mlir::Value cond_operand, mlir::ConversionPatternRewriter &rewriter) { - if (!cond_operand.getType().isa()) { + if (!mlir::isa(cond_operand.getType())) { cond_operand = tfrt_compiler::ConvertCoreRTTensorHandleToFallbackTensor( op->getLoc(), tfrt_compiler::GetDefaultCpuDeviceName(), cond_operand, rewriter); @@ -1119,12 +1104,10 @@ class TFRTCondOpConversion : public mlir::OpConversionPattern { public: TFRTCondOpConversion(mlir::MLIRContext *context, mlir::TypeConverter *type_converter, - CoreRTConverter *corert_converter, - bool func_use_fallback_tensor) + CoreRTConverter *corert_converter) : mlir::OpConversionPattern(context), type_converter_(*type_converter), - corert_converter_(*corert_converter), - func_use_fallback_tensor_(func_use_fallback_tensor) {} + corert_converter_(*corert_converter) {} mlir::LogicalResult matchAndRewrite( mlir::TF::IfOp op, OpAdaptor adaptor, @@ -1150,8 +1133,7 @@ class TFRTCondOpConversion : public mlir::OpConversionPattern { corert_converter_.GetLocalSideEffectChain(op, &rewriter)); if (mlir::failed(ConvertFunctionCallOperands( - op, adaptor.getOperands().drop_front(), &new_operands, rewriter, - func_use_fallback_tensor_))) + op, adaptor.getOperands().drop_front(), &new_operands, rewriter))) return failure(); auto new_op = rewriter.create( @@ -1174,7 +1156,6 @@ class TFRTCondOpConversion : public mlir::OpConversionPattern { private: mlir::TypeConverter &type_converter_; CoreRTConverter &corert_converter_; - bool func_use_fallback_tensor_; }; // Convert TF WhileOp to tfrt.while. tfrt.while use a boolean condition and has @@ -1219,14 +1200,12 @@ class TFRTWhileOpConversion mlir::SymbolTable *symbol_table, const tfrt_compiler::TensorArraySideEffectAnalysis *tensor_array_side_effect_analysis, - bool func_use_fallback_tensor, bool enable_while_parallel_iterations) : mlir::OpConversionPattern(context), type_converter_(*type_converter), corert_converter_(*corert_converter), symbol_table_(*symbol_table), tensor_array_side_effect_analysis_(*tensor_array_side_effect_analysis), - func_use_fallback_tensor_(func_use_fallback_tensor), enable_while_parallel_iterations_(enable_while_parallel_iterations) {} mlir::LogicalResult matchAndRewrite( @@ -1248,8 +1227,7 @@ class TFRTWhileOpConversion // specified in the option. 
llvm::SmallVector new_operands; if (mlir::failed(ConvertFunctionCallOperands(op, adaptor.getOperands(), - &new_operands, rewriter, - func_use_fallback_tensor_))) + &new_operands, rewriter))) return failure(); // Create the predicate function that calls the original cond function and @@ -1328,7 +1306,6 @@ class TFRTWhileOpConversion mlir::SymbolTable &symbol_table_; const tfrt_compiler::TensorArraySideEffectAnalysis &tensor_array_side_effect_analysis_; - bool func_use_fallback_tensor_; bool enable_while_parallel_iterations_; }; @@ -1518,9 +1495,8 @@ void PopulateTFToTFRTConversionPatterns( const tfrt_compiler::CostAnalysis *cost_analysis, const tfrt_compiler::TensorArraySideEffectAnalysis *tensor_array_side_effect_analysis, - bool func_use_fallback_tensor, bool enable_while_parallel_iterations, - bool tpu_lower_to_fallback, bool target_tpurt, - bool use_gpu_compile_and_execute_op) { + bool enable_while_parallel_iterations, bool tpu_lower_to_fallback, + bool target_tpurt, bool use_gpu_compile_and_execute_op) { // By default, we lower all TF ops to fallback ops. patterns->add( context, corert_converter, fallback_converter, symbol_table, @@ -1534,23 +1510,17 @@ void PopulateTFToTFRTConversionPatterns( // For control flow ops, we handle them according to the option. mlir::TypeConverter *func_type_converter; - if (func_use_fallback_tensor) { - func_type_converter = fallback_converter; - } else { - func_type_converter = corert_converter; - } + func_type_converter = fallback_converter; patterns->add(context, func_type_converter); - patterns->add(context, corert_converter, - func_use_fallback_tensor); + patterns->add(context, corert_converter); patterns->add( context, func_type_converter, corert_converter, symbol_table, - tensor_array_side_effect_analysis, func_use_fallback_tensor, - enable_while_parallel_iterations); + tensor_array_side_effect_analysis, enable_while_parallel_iterations); patterns->add, TFRTCallOpConversion, TFRTCallOpConversion, TFRTCaseOpConversion, TFRTCondOpConversion>( - context, func_type_converter, corert_converter, func_use_fallback_tensor); + context, func_type_converter, corert_converter); // For tf.BatchFunction, we need a special fallback op to batch a BEF // function. @@ -1562,8 +1532,9 @@ void PopulateTFToTFRTConversionPatterns( // Here we use specialized patterns for tf.Const on CPU as it is incorrect to // use ExecuteOp pattern to convert string tensor attribute. - patterns->add(context, corert_converter); + patterns->add(context, + corert_converter); } // Lower TF dialect MLIR to TFRT dialect. 
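For reference, the pattern registrations in the hunk above with their stripped template lists reconstructed; the concrete CallOp variants (StatefulPartitionedCallOp, PartitionedCallOp, LegacyCallOp) are a best-effort assumption from this file's other hunks, and the fragment assumes the surrounding PopulateTFToTFRTConversionPatterns scope:

patterns->add<TFRTFuncOpSignatureConversion>(context, func_type_converter);
patterns->add<TFRTReturnOpConversion>(context, corert_converter);
patterns->add<TFRTWhileOpConversion>(
    context, func_type_converter, corert_converter, symbol_table,
    tensor_array_side_effect_analysis, enable_while_parallel_iterations);
patterns->add<TFRTCallOpConversion<mlir::TF::StatefulPartitionedCallOp>,
              TFRTCallOpConversion<mlir::TF::PartitionedCallOp>,
              TFRTCallOpConversion<mlir::TF::LegacyCallOp>,
              TFRTCaseOpConversion, TFRTCondOpConversion>(
    context, func_type_converter, corert_converter);
patterns->add<FallbackConstDenseTensorOpConversion,
              FallbackConstStringTensorOpConversion>(context,
                                                     corert_converter);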
@@ -1598,7 +1569,6 @@ class TfToTfrtConversionPass tpu_allow_unpadded_batch_ = options.tpu_allow_unpadded_batch; cost_threshold_ = options.cost_threshold; merge_inter_dependent_streams_ = options.merge_inter_dependent_streams; - func_use_fallback_tensor_ = options.func_use_fallback_tensor; enable_while_parallel_iterations_ = options.enable_while_parallel_iterations; target_gpu_ = options.target_gpu; @@ -1633,19 +1603,15 @@ class TfToTfrtConversionPass } mlir::TypeConverter *func_type_converter; - if (func_use_fallback_tensor_) { - func_type_converter = &fallback_converter; - } else { - func_type_converter = &corert_converter; - } + func_type_converter = &fallback_converter; SetUpTFToTFRTConversionLegality(&target, func_type_converter, corert_converter.chain_type()); PopulateTFToTFRTConversionPatterns( &context, &patterns, &corert_converter, &fallback_converter, &symbol_table, &cost_analysis, &tensor_array_side_effect_analysis, - func_use_fallback_tensor_, enable_while_parallel_iterations_, - tpu_lower_to_fallback_, target_tpurt_, use_gpu_compile_and_execute_op_); + enable_while_parallel_iterations_, tpu_lower_to_fallback_, + target_tpurt_, use_gpu_compile_and_execute_op_); return mlir::applyPartialConversion(func, target, std::move(patterns)); } @@ -1721,7 +1687,7 @@ class TfToTfrtConversionPass auto return_op = llvm::cast(block.getTerminator()); auto chain = return_op->getOperand(0); - assert(chain.getType().isa()); + assert(mlir::isa(chain.getType())); dangling_values.push_back(chain); mlir::OpBuilder builder(return_op); @@ -1857,13 +1823,6 @@ class TfToTfrtConversionPass "preferred to be merged for inline execution."), llvm::cl::init(false)}; - Option func_use_fallback_tensor_{ - *this, "func-use-fallback-tensor", - llvm::cl::desc( - "If true, use TF tensor as input/output types in func (and other " - "control flow) ops."), - llvm::cl::init(false)}; - Option enable_while_parallel_iterations_{ *this, "enable-while-parallel-iterations", llvm::cl::desc("If true, tf.While op will be parallelized. This is " diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h index b1549838b11ce5..f893749011aca0 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_PIPELINE_OPTIONS_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_TFRT_PIPELINE_OPTIONS_H_ +#include #include #include "llvm/Support/CommandLine.h" @@ -107,13 +108,6 @@ struct TfrtPipelineOptions llvm::cl::desc("If true, gpurt.compile_and_execute is used for GPU"), llvm::cl::init(false)}; - Option func_use_fallback_tensor{ - *this, "func-use-fallback-tensor", - llvm::cl::desc( - "If true, use TF tensor as input/output types in func (and other " - "control flow) ops."), - llvm::cl::init(false)}; - Option enable_while_parallel_iterations{ *this, "enable-while-parallel-iterations", llvm::cl::desc("If true, tf.While op will be parallelized. 
This is " @@ -144,6 +138,10 @@ struct TfrtPipelineOptions "cheap, and then whether it can be executed inline."), llvm::cl::init(1)}; + Option min_num_batch_threads{ + *this, "tfrt-min-num-batch-threads", + llvm::cl::desc("The minimum number of batch threads"), llvm::cl::init(1)}; + Option merge_inter_dependent_streams{ *this, "tfrt-merge-inter-dependent-streams", llvm::cl::desc("If true, streams with inter data depenedencies will be " diff --git a/tensorflow/compiler/mlir/tfrt/transforms/utils.cc b/tensorflow/compiler/mlir/tfrt/transforms/utils.cc index 9b602babeafe22..711438f21d13f9 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/utils.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/utils.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tfrt/basic_kernels/opdefs/tfrt_base.h" // from @tf_runtime @@ -34,7 +35,7 @@ limitations under the License. namespace tensorflow { bool IsResourceArgument(mlir::Value value) { - auto arg = value.dyn_cast(); + auto arg = mlir::dyn_cast(value); if (!arg) return false; auto func = llvm::cast(arg.getOwner()->getParentOp()); @@ -44,7 +45,7 @@ bool IsResourceArgument(mlir::Value value) { bool IsResultVariable(const mlir::Value &original_operand, const mlir::Value &operand) { - if (original_operand.isa()) { + if (mlir::isa(original_operand)) { auto defining_op = original_operand.getDefiningOp(); // TODO(b/174753886): When device assignment is properly done, we @@ -99,7 +100,8 @@ bool IsSessionInitializer(mlir::func::FuncOp op) { if (!session_initializer_op) return false; for (auto sym_ref : session_initializer_op.getInitializers()) { - if (op.getSymName() == sym_ref.cast().getValue()) + if (op.getSymName() == + mlir::cast(sym_ref).getValue()) return true; } diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc index 3cf8be9c90cb62..77e3c687f0c02a 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc @@ -123,7 +123,7 @@ absl::StatusOr> ExportXlaFunctions( func_op->walk([&](mlir::Operation* op) { for (const mlir::NamedAttribute& attr : op->getAttrs()) { if (const auto sym = - attr.getValue().dyn_cast()) { + mlir::dyn_cast(attr.getValue())) { mlir::Operation* func = mlir::SymbolTable::lookupNearestSymbolFrom(op, sym); if (func) { @@ -342,10 +342,11 @@ std::unique_ptr GetTfrtPipelineOptions( pipeline_options->hoist_invariant_ops = options.hoist_invariant_ops; pipeline_options->fuse_get_resource_ops_in_hoisting = options.fuse_get_resource_ops_in_hoisting; - pipeline_options->func_use_fallback_tensor = true; pipeline_options->enable_while_parallel_iterations = options.enable_while_parallel_iterations; pipeline_options->cost_threshold = options.cost_threshold; + pipeline_options->min_num_batch_threads = options.min_num_batch_threads; + pipeline_options->merge_inter_dependent_streams = options.merge_inter_dependent_streams; diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD index bc887cdfc966f9..cb517d1039711f 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD +++ 
b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD @@ -7,7 +7,6 @@ package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ # copybara:uncomment "//learning/brain/experimental/tfrt:__subpackages__", - # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", # copybara:uncomment "//smartass/brain/ops/tfrt_kernels:__subpackages__", "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__subpackages__", "//tensorflow/core/tfrt:__subpackages__", @@ -29,6 +28,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -43,6 +43,7 @@ tf_cc_test( "@com_google_googletest//:gtest_main", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Support", "@local_tsl//tsl/platform:resource_loader", "@local_tsl//tsl/platform:status_matchers", "@local_tsl//tsl/platform:test", diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc index 98cb26acdba8fa..06606c6fff345e 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc @@ -29,6 +29,7 @@ limitations under the License. #include "llvm/ADT/TypeSwitch.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/core/tfrt/mlrt/bytecode/executable.h" namespace mlrt { @@ -37,8 +38,8 @@ namespace { // LINT.IfChange(mlrt_attributes) bool CanBeInlined(mlir::Attribute attr, absl::string_view data) { // FlatSymbolRefAttr is a special case as we are emitting it as integer. - return attr.isa() && + return mlir::isa( + attr) && data.size() <= sizeof(uint32_t); } // LINT.ThenChange(../../../../../core/tfrt/mlrt/interpreter/attribute_span.h:mlrt_attributes) @@ -64,7 +65,7 @@ std::optional EncodeListOfInteger(mlir::ArrayAttr array) { mlir::Type type; for (int i = 0; i < array.size(); ++i) { - if (auto integer_attr = array[i].dyn_cast()) { + if (auto integer_attr = mlir::dyn_cast(array[i])) { if (type && integer_attr.getType() != type) return std::nullopt; type = integer_attr.getType(); llvm::APInt value = integer_attr.getValue(); @@ -85,7 +86,7 @@ std::optional EncodeListOfSymbolRef( auto ctor = bc::New>(&allocator, array.size()); for (int i = 0; i < array.size(); ++i) { - if (auto symbol_ref = array[i].dyn_cast()) { + if (auto symbol_ref = mlir::dyn_cast(array[i])) { ctor.ConstructAt(i, module_context.GetFunctionId(symbol_ref.getValue())); } else { return std::nullopt; @@ -117,7 +118,7 @@ std::optional EncodeListOfString(mlir::ArrayAttr array) { auto ctor = bc::New>(&allocator, array.size()); for (int i = 0; i < array.size(); ++i) { - if (auto string_attr = array[i].dyn_cast()) { + if (auto string_attr = mlir::dyn_cast(array[i])) { ctor.ConstructAt(i, string_attr.getValue().str()); } else { return std::nullopt; diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc index 8214d1d6deb3b3..07f1fbfdb0c0c1 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc @@ -21,6 +21,7 @@ limitations under the License. 
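The inlining predicate from mlir_to_bytecode.cc above, reconstructed with the attribute list that the rendering dropped; the three attribute kinds are an assumption based on the encoder functions in the same file:

#include <cstdint>
#include "absl/strings/string_view.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Support/LLVM.h"

// Attributes whose serialized form fits in four bytes are stored inline in
// the bytecode rather than emitted as separate data sections.
bool CanBeInlined(mlir::Attribute attr, absl::string_view data) {
  // FlatSymbolRefAttr is a special case as we are emitting it as integer.
  return mlir::isa<mlir::IntegerAttr, mlir::FloatAttr,
                   mlir::FlatSymbolRefAttr>(attr) &&
         data.size() <= sizeof(uint32_t);
}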
#include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/core/tfrt/mlrt/bytecode/executable.h" #include "tensorflow/core/tfrt/mlrt/interpreter/attribute_span.h" #include "tsl/platform/resource_loader.h" @@ -299,13 +300,13 @@ class CustomDense { absl::StatusOr EncodeCustomDense(const ModuleEmitterContext&, mlir::Attribute attr) { - auto dense_int_attr = attr.dyn_cast(); + auto dense_int_attr = mlir::dyn_cast(attr); if (!dense_int_attr) return absl::InvalidArgumentError( "The element of the custom dense attribute must be an integer."); - if (dense_int_attr.getElementType().cast().getWidth() != - 32) { + if (mlir::cast(dense_int_attr.getElementType()) + .getWidth() != 32) { return absl::InvalidArgumentError( "The element of the custom dense attribute must be an i32 integer."); } diff --git a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.cc b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.cc index cf4950d5edbbf3..aadacd8563b7ea 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.cc @@ -19,8 +19,6 @@ limitations under the License. #include #include -#include "absl/strings/str_join.h" - namespace tensorflow { std::ostream& operator<<(std::ostream& os, @@ -40,8 +38,7 @@ std::ostream& operator<<(std::ostream& os, } std::ostream& operator<<(std::ostream& os, const TfrtCompileOptions& options) { - return os << "{" - << "variable_device = " << options.variable_device + return os << "{" << "variable_device = " << options.variable_device << ", default_device = " << options.default_device << ", enable_optimizer = " << options.enable_optimizer << ", enable_grappler = " << options.enable_grappler @@ -58,6 +55,7 @@ std::ostream& operator<<(std::ostream& os, const TfrtCompileOptions& options) { << ", enable_while_parallel_iterations = " << options.enable_while_parallel_iterations << ", cost_threshold = " << options.cost_threshold + << ", min_num_batch_threads = " << options.min_num_batch_threads << ", merge_inter_dependent_streams = " << options.merge_inter_dependent_streams << ", decompose_resource_ops = " << options.decompose_resource_ops diff --git a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h index 11790e9fa438a0..db50b062b8a209 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h +++ b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h @@ -136,6 +136,13 @@ struct TfrtCompileOptions { // expensive. uint64_t cost_threshold = 1; + // The minimum number of batch threads. This number provides a lower bound on + // the number of batch threads on top of what is specified in the model. If + // the number of batch threads is too small (e.g. smaller than the number of + // parallel hardware accelerator available), it can lead to under utilization + // of resources. + int64_t min_num_batch_threads = 1; + // If true, streams with inter data depenedencies will be preferred to be // merged for inline execution. 
 bool merge_inter_dependent_streams = true;
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
index 1069f3fd172411..e9dd7ef5f5e505 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
@@ -159,6 +159,7 @@ tf_cc_binary(
         "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:passes",
         "@llvm-project//mlir:AllPassesAndDialects",
         "@llvm-project//mlir:MlirOptLib",
+        "@llvm-project//mlir:Support",
         "@local_xla//xla/mlir_hlo:all_passes",
         "@local_xla//xla/mlir_hlo:hlo_dialect_registration",
         "@stablehlo//:register",
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD
index 42d679c35d0173..2862b79475a930 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD
@@ -92,6 +92,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:SideEffectInterfaces",
+        "@llvm-project//mlir:Support",
         "@local_tsl//tsl/protobuf:error_codes_proto_impl_cc",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc
index a3b8c07cc1bb66..ee295c19335ff5 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "mlir/Dialect/MemRef/IR/MemRef.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/DialectImplementation.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_status.cc.inc"
 
 // Generated dialect definitions.
@@ -61,11 +62,11 @@ Type TFFrameworkDialect::parseType(DialectAsmParser &parser) const {
 /// Print a type registered to this dialect.
 void TFFrameworkDialect::printType(Type type, DialectAsmPrinter &os) const {
-  if (type.isa<OpKernelContextType>()) {
+  if (mlir::isa<OpKernelContextType>(type)) {
     os << "op_kernel_context";
     return;
   }
-  if (type.isa<JITCallableType>()) {
+  if (mlir::isa<JITCallableType>(type)) {
     os << "jit_callable";
     return;
   }
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
index 277511fed098e0..404d134223ab48 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
@@ -52,6 +52,7 @@ limitations under the License.
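A hypothetical mirror of the printType hunk above, showing how the dialect's parseType counterpart conventionally round-trips the same keywords; the real parser in tf_framework_ops.cc may differ in details, so treat this as a sketch of the convention, not the file's code:

#include "mlir/IR/DialectImplementation.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h"

// Assumed to live inside namespace mlir::kernel_gen::tf_framework.
Type TFFrameworkDialect::parseType(DialectAsmParser &parser) const {
  StringRef keyword;
  if (parser.parseKeyword(&keyword)) return Type();
  if (keyword == "op_kernel_context")
    return OpKernelContextType::get(getContext());
  if (keyword == "jit_callable") return JITCallableType::get(getContext());
  parser.emitError(parser.getNameLoc(), "unknown TF Framework type: ")
      << keyword;
  return Type();
}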
#include "mlir/Interfaces/DataLayoutInterfaces.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" // from @llvm-project @@ -91,7 +92,7 @@ bool IsSmallAlloc(Value alloc) { constexpr unsigned kMaximumSizeInBytes = 64; constexpr unsigned kMaxRankOfAllocatedMemRef = 1; - auto type = alloc.getType().dyn_cast(); + auto type = mlir::dyn_cast(alloc.getType()); if (!type || !alloc.getDefiningOp()) return false; if (!type.hasStaticShape()) { // Check if the dynamic shape dimension of the alloc is produced by RankOp @@ -176,8 +177,6 @@ Status LowerHlotoLoops(mlir::ModuleOp module, // Transform HLO operations to LinAlg and standard. pm.addNestedPass(::mlir::mhlo::createLegalizeHloToLinalgPass()); pm.addPass(::mlir::mhlo::createLegalizeToArithmeticPass()); - pm.addNestedPass( - mlir::mhlo::createLegalizeHloShapeOpsToStandardPass()); // Remove the remaining references to unsigned types after all HLO compute // operations were converted. diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir index 686b34e0d138db..c2a9404ebb926b 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir @@ -46,7 +46,7 @@ func.func @AddV2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> at %cast = tensor.cast %23 : tensor to tensor<*xf32> scf.yield %cast : tensor<*xf32> } else { - %19:2 = chlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor + %19:2 = mhlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor %20 = shape.rank %19#0 : tensor -> index %21 = shape.rank %19#1 : tensor -> index %22 = arith.cmpi sgt, %20, %21 : index diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir index f38a2dca1bc8cd..a83f91663951de 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir @@ -46,7 +46,7 @@ func.func @AddV2(%arg0: tensor<*xui32>, %arg1: tensor<*xui32>) -> tensor<*xui32> %cast = tensor.cast %23 : tensor to tensor<*xui32> scf.yield %cast : tensor<*xui32> } else { - %19:2 = chlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor + %19:2 = mhlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor %20 = shape.rank %19#0 : tensor -> index %21 = shape.rank %19#1 : tensor -> index %22 = arith.cmpi sgt, %20, %21 : index diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/minimum.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/minimum.mlir index 1facc06ee500e9..dc41eee4404837 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/minimum.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/minimum.mlir @@ -46,7 +46,7 @@ func.func @Minimum_GPU_DT_UINT32_DT_UINT32(%arg0: tensor<*xui32>, %arg1: tensor< %cast = tensor.cast %23 : tensor to tensor<*xui32> scf.yield 
%cast : tensor<*xui32> } else { - %19:2 = chlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor + %19:2 = mhlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor %20 = shape.rank %19#0 : tensor -> index %21 = shape.rank %19#1 : tensor -> index %22 = arith.cmpi sgt, %20, %21 : index diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc index 178e899cb33a72..dcbe88c3048eae 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tools/kernel-gen-opt/kernel-gen-opt.cc @@ -15,19 +15,18 @@ limitations under the License. #include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project #include "stablehlo/dialect/Register.h" // from @stablehlo #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" -#include "xla/mlir_hlo/lhlo/transforms/passes.h" #include "xla/mlir_hlo/mhlo/IR/register.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" int main(int argc, char **argv) { mlir::registerAllPasses(); mlir::mhlo::registerAllMhloPasses(); - mlir::lmhlo::registerAllLmhloPasses(); mlir::kernel_gen::registerKernelGenPasses(); mlir::DialectRegistry registry; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index d1c3af0b9a6191..489e13d172c059 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -39,6 +39,7 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMCommonConversion", "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", ], @@ -75,6 +76,7 @@ cc_library( "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:MemRefDialect", + "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", ], diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc index 37999960cd69e7..45dbbf993bb6be 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" @@ -115,7 +116,7 @@ class BufferReuseAnalysis { // Find reuse candidates for the regarded allocation. 
SmallVector local_reuse_candidates; for (BlockArgument old_buffer : arguments) { - if (!old_buffer.getType().isa()) continue; + if (!mlir::isa(old_buffer.getType())) continue; // Lifetime criterion: Only reuse buffers that are no longer used on // first reuse, i.e. they are no longer alive. @@ -177,15 +178,16 @@ class BufferReuseAnalysis { std::vector get_buffer_arguments(func::FuncOp &f) { std::vector buffer_arguments; for (BlockArgument arg : f.getArguments()) { - if (arg.getType().isa()) buffer_arguments.push_back(arg); + if (mlir::isa(arg.getType())) + buffer_arguments.push_back(arg); } return buffer_arguments; } bool can_reuse_locally(Operation *op, Value old_buffer, Value new_buffer) { // For now, we support only memrefs with the same memory layout. - auto old_buffer_ty = old_buffer.getType().dyn_cast(); - auto new_buffer_ty = old_buffer.getType().dyn_cast(); + auto old_buffer_ty = mlir::dyn_cast(old_buffer.getType()); + auto new_buffer_ty = mlir::dyn_cast(old_buffer.getType()); if (!old_buffer_ty || !new_buffer_ty || old_buffer_ty.getLayout() != new_buffer_ty.getLayout()) return false; @@ -205,7 +207,7 @@ class BufferReuseAnalysis { // Allow dropping dimensions but no permutations. int64_t i = -1; for (AffineExpr expr : map.getResults()) { - auto dim_expr = expr.dyn_cast(); + auto dim_expr = mlir::dyn_cast(expr); if (!dim_expr || dim_expr.getPosition() <= i) return false; i = dim_expr.getPosition(); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc index 32faed506e52b4..9f41b399e2fd7f 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" namespace mlir { @@ -135,7 +136,7 @@ void RemoveCopyIfTargetIsFunctionArg(func::FuncOp func) { Block &body = func.getBody().front(); for (auto &op : llvm::reverse(body.without_terminator())) { if (auto copy = dyn_cast(op)) { - auto block_arg = copy.getTarget().dyn_cast(); + auto block_arg = mlir::dyn_cast(copy.getTarget()); if (!block_arg) break; if (!isa(block_arg.getOwner()->getParentOp()) || !block_arg.hasOneUse()) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc index b7ad2d4d28b129..a6f23f1ad43aa8 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc @@ -21,6 +21,7 @@ limitations under the License. 
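The layout-compatibility check from can_reuse_locally in buffer_reuse_pass.cc above, with the dyn_cast targets restored. Note that both locals derive from old_buffer.getType() in the source, before and after this commit; that behavior is preserved verbatim here rather than "fixed":

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"

bool HaveCompatibleLayouts(mlir::Value old_buffer, mlir::Value new_buffer) {
  // new_buffer is intentionally unused below, mirroring the source, which
  // builds both types from old_buffer.
  (void)new_buffer;
  auto old_buffer_ty = mlir::dyn_cast<mlir::MemRefType>(old_buffer.getType());
  auto new_buffer_ty = mlir::dyn_cast<mlir::MemRefType>(old_buffer.getType());
  return old_buffer_ty && new_buffer_ty &&
         old_buffer_ty.getLayout() == new_buffer_ty.getLayout();
}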
#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" @@ -64,7 +65,7 @@ std::optional FindOpKernelContext(Operation *op) { return std::nullopt; } Value ctx = func.getArgument(0); - if (!ctx.getType().isa()) { + if (!mlir::isa(ctx.getType())) { return std::nullopt; } return ctx; @@ -114,7 +115,8 @@ struct DeallocOpConverter : public OpConversionPattern { if (!ctx) return failure(); // Operand with no layout is expected. - auto operand_memref_type = dealloc.getMemref().getType().cast(); + auto operand_memref_type = + mlir::cast(dealloc.getMemref().getType()); if (!operand_memref_type.getLayout().isIdentity()) { return failure(); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc index ed1138849e5a06..b5b22008dcb951 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/MemRef/IR/MemRef.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" @@ -71,7 +72,7 @@ class EmbedTFFrameworkPass } FunctionType func_type = op.getFunctionType(); return func_type.getNumInputs() > 0 && - func_type.getInput(0).isa(); + mlir::isa(func_type.getInput(0)); }); target.addDynamicallyLegalOp(IsNotInsideTfEntryFunction); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc index a6b24b1a3afcc3..fa6ba2491d5906 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc @@ -217,7 +217,7 @@ class ShapeEqualityKnowledge { } if (auto alloc = dyn_cast(op)) { SmallVector shape; - ShapedType type = alloc.getResult().getType().cast(); + ShapedType type = mlir::cast(alloc.getResult().getType()); fillShapeFromAllocLike(alloc.getDynamicSizes(), type, shape); registerAssociation(ShapeValue{shape}, alloc.getResult()); return; @@ -225,7 +225,7 @@ class ShapeEqualityKnowledge { if (auto alloc = dyn_cast(op)) { // Construct a symbol representing the allocated shape. SmallVector shape; - ShapedType type = alloc.getResult().getType().cast(); + ShapedType type = mlir::cast(alloc.getResult().getType()); fillShapeFromAllocLike(alloc.getDynSizes(), type, shape); registerAssociation(ShapeValue{shape}, alloc.getResult()); return; @@ -331,7 +331,7 @@ struct PropagateShapeKnowledgeToKernels // Position of the kernel argument we are currently at. 
int kernel_p = 0; for (auto operand : launch.getKernelOperands()) { - auto memref = operand.getType().dyn_cast(); + auto memref = mlir::dyn_cast(operand.getType()); if (!memref) { // Scalar argument, advance kernel position by one. kernel_p++; @@ -341,7 +341,7 @@ struct PropagateShapeKnowledgeToKernels if (!knowledge.haveSameShape(operand, previous.first)) { continue; } - auto previous_type = previous.first.getType().cast(); + auto previous_type = mlir::cast(previous.first.getType()); // We use the first equality found and replace uses of corresponding // size and (potentially) stride information here. auto args_to_replace = memref.getRank(); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc index 89ecd6da13be74..a7d26813239571 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc @@ -56,7 +56,7 @@ struct PropagateTfAbiKnowledgeToKernelsPass // the inner stride is one. // TODO(herhut): Insert asserts in debug mode to check this. for (auto argument : function.getArguments()) { - if (argument.getType().isa()) { + if (mlir::isa(argument.getType())) { worklist.push_back(argument); allocated_by_tf_runtime.insert(argument); offset_is_zero.insert(argument); @@ -95,7 +95,7 @@ struct PropagateTfAbiKnowledgeToKernelsPass llvm::SmallDenseMap constants; auto loc = kernel.getLoc(); for (auto operand : launch.getKernelOperands()) { - auto memref = operand.getType().dyn_cast(); + auto memref = mlir::dyn_cast(operand.getType()); if (!memref) { // Scalar argument, advance kernel position by one. kernel_p++; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc index cffa5e7b44691e..8748b188f35dfa 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -25,6 +25,7 @@ limitations under the License. 
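The operand walk in PropagateShapeKnowledgeToKernels above follows a common lowering convention: a scalar occupies one kernel parameter, while each memref expands into descriptor fields. A sketch; the 3 + 2 * rank stride assumes the standard MemRef descriptor layout (two pointers, offset, sizes, strides) and is not taken from the diff:

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"

void WalkKernelOperands(mlir::gpu::LaunchFuncOp launch) {
  int kernel_p = 0;  // position of the kernel argument we are currently at
  for (mlir::Value operand : launch.getKernelOperands()) {
    auto memref = mlir::dyn_cast<mlir::MemRefType>(operand.getType());
    if (!memref) {
      kernel_p++;  // Scalar argument, advance kernel position by one.
      continue;
    }
    // Memref argument: skip over its expanded descriptor fields.
    kernel_p += 3 + 2 * memref.getRank();
  }
}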
#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h" @@ -96,14 +97,14 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern { Location loc, Type size_ty, Type element_ty, std::optional attr, ConversionPatternRewriter *rewriter) const { - assert(size_ty.isa() && "expect integer size type"); - assert(element_ty.isa() && "expect integer element type"); + assert(mlir::isa(size_ty) && "expect integer size type"); + assert(mlir::isa(element_ty) && "expect integer element type"); return ConvertArrayAttrToStackAllocatedArray( loc, size_ty, element_ty, attr, rewriter, [&](Attribute attr) { return rewriter->create( loc, element_ty, rewriter->getIntegerAttr(element_ty, - attr.cast().getInt())); + mlir::cast(attr).getInt())); }); } }; @@ -227,7 +228,7 @@ class TFDeallocOpConverter : public ConvertToLLVMCallOpPattern { TFDeallocOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // TODO(herhut) Support unranked memrefs. - if (!op.getMemref().getType().isa()) return failure(); + if (!mlir::isa(op.getMemref().getType())) return failure(); MemRefDescriptor memref(adaptor.getMemref()); Value allocated_bytes_ptr = memref.allocatedPtr(rewriter, op.getLoc()); @@ -429,7 +430,7 @@ class ReportErrorOpConverter std::string err_str; llvm::raw_string_ostream err_stream(err_str); err_stream << message; - if (!loc.isa()) { + if (!mlir::isa(loc)) { err_stream << " at "; loc.print(err_stream); } @@ -465,16 +466,18 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { MLIRContext *ctx = null_memref_op.getContext(); mlir::Operation *op = null_memref_op.getOperation(); - auto shaped_result_type = null_memref_op.getType().cast(); - auto mem_space = - shaped_result_type.getMemorySpace().dyn_cast_or_null(); + auto shaped_result_type = + mlir::cast(null_memref_op.getType()); + auto mem_space = mlir::dyn_cast_or_null( + shaped_result_type.getMemorySpace()); unsigned address_space = static_cast(mem_space ? mem_space.getInt() : 0); LLVM::LLVMPointerType llvm_ptr_type = LLVM::LLVMPointerType::get(ctx, address_space); Value zero = createIndexAttrConstant(rewriter, loc, getIndexType(), 0); - if (auto result_type = null_memref_op.getType().dyn_cast()) { + if (auto result_type = + mlir::dyn_cast(null_memref_op.getType())) { // Set all dynamic sizes to 1 and compute fake strides. SmallVector dyn_sizes( result_type.getNumDynamicDims(), @@ -497,7 +500,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { return success(); } - auto result_type = null_memref_op.getType().cast(); + auto result_type = mlir::cast(null_memref_op.getType()); Type llvm_result_type = type_converter.convertType(result_type); auto desc = @@ -506,7 +509,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern { // Extract address space and element type. auto targetType = - null_memref_op.getResult().getType().cast(); + mlir::cast(null_memref_op.getResult().getType()); unsigned addressSpace = *getTypeConverter()->getMemRefAddressSpace(targetType); @@ -549,7 +552,7 @@ class IsValidMemRefOpConverter MemRefDescriptor desc(adaptor.getArg()); // Compare every size in the descriptor to 0 to check num_elements == 0. 
- int64_t rank = op.getArg().getType().cast().getRank(); + int64_t rank = mlir::cast(op.getArg().getType()).getRank(); Value is_empty_shape = rewriter.create( loc, rewriter.getI1Type(), rewriter.getBoolAttr(false)); Value zero = createIndexAttrConstant(rewriter, loc, getIndexType(), 0); diff --git a/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir b/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir index 5719fd35989a6b..64fcdfc18d081f 100644 --- a/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/retain_call_once_funcs.mlir @@ -4,7 +4,7 @@ module { // CHECK-LABEL: @main func.func @main(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) { - // CHECK: "tfl.call_once"() {session_init_function = "NoOp", session_init_function_symbol = @NoOp} : () -> () + // CHECK: "tfl.call_once"() <{session_init_function = "NoOp"}> {session_init_function_symbol = @NoOp} : () -> () "tfl.call_once"() {session_init_function = "NoOp"} : () -> () %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> diff --git a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir index 53cbd84d6e441f..11648c9572b63c 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir @@ -100,7 +100,7 @@ func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> te // ----- // CHECK-LABEL: test_real_div -// CHECK: %[[VAR0:.*]] = tosa.div %arg0, %arg1 +// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %arg1 func.func @test_real_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi32> { %2 = "tf.RealDiv"(%arg0, %arg1) : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi32> func.return %2 : tensor<13x21x3xi32> @@ -109,7 +109,7 @@ func.func @test_real_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) // ----- // CHECK-LABEL: test_floor_div -// CHECK: %[[VAR0:.*]] = tosa.div %arg0, %arg1 +// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %arg1 func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi32> { %2 = "tf.FloorDiv"(%arg0, %arg1) : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi32> func.return %2 : tensor<13x21x3xi32> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir index c73ece4991d513..77bafd5bc1ba9a 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline-filtered.mlir @@ -4,7 +4,7 @@ // CHECK-LABEL: test_conv2d // CHECK-DAG: %[[VAR0:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> -// CHECK: %[[VAR1:.*]] = "tfl.conv_2d"(%arg0, %arg1, %[[VAR0]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} +// CHECK: %[[VAR1:.*]] = "tfl.conv_2d"(%arg0, %arg1, %[[VAR0]]) <{dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}> func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>) -> tensor<*xf32> { %cst = arith.constant dense<0.000000e+00> : 
tensor<16xf32> %0 = "tfl.conv_2d"(%arg0, %arg1, %cst) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<*xf32> @@ -15,7 +15,7 @@ func.func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32> // CHECK-LABEL: func @test_softmax( // CHECK-SAME:%[[VAR0:.*]]: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { -// CHECK: %[[VAR1:.*]] = "tfl.softmax"(%[[VAR0]]) {beta = 1.000000e+00 : f32} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> +// CHECK: %[[VAR1:.*]] = "tfl.softmax"(%[[VAR0]]) <{beta = 1.000000e+00 : f32}> : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> // CHECK: return %[[VAR1]] : tensor<13x21x3xf32> func.func @test_softmax(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { %0 = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir index 57a5fdf02205f0..5ee9eaf4cd5517 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir @@ -370,7 +370,7 @@ func.func @test_rcp(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_div // CHECK-DAG: %[[RESHAPE:.*]] = tosa.reshape %arg1 -// CHECK: %[[VAR0:.*]] = tosa.div %arg0, %[[RESHAPE]] +// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %[[RESHAPE]] func.func @test_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<*xi32> { %0 = "tfl.div"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xi32>, tensor) -> tensor<*xi32> func.return %0 : tensor<*xi32> @@ -380,7 +380,7 @@ func.func @test_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<*x // CHECK-LABEL: test_floor_div // CHECK-DAG: %[[RESHAPE:.*]] = tosa.reshape %arg1 -// CHECK: %[[VAR0:.*]] = tosa.div %arg0, %[[RESHAPE]] +// CHECK: %[[VAR0:.*]] = tosa.int_div %arg0, %[[RESHAPE]] func.func @test_floor_div(%arg0: tensor<13x21x3xi32>, %arg1: tensor) -> tensor<*xi32> { %0 = "tfl.floor_div"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<13x21x3xi32>, tensor) -> tensor<*xi32> func.return %0 : tensor<*xi32> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir index 401089a6d7cb99..6ad25dca4b8abd 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-stateful.mlir @@ -65,7 +65,7 @@ module { module { // CHECK-LABEL: @nostate // CHECK: %[[VAL_0:.*]]: tensor<16x16xf32>) -> tensor<16x16xf32> { - // CHECK: %[[VAL_1:.*]] = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource> + // CHECK: %[[VAL_1:.*]] = "tfl.var_handle"() <{container = "", shared_name = "Variable"}> : () -> tensor<*x!tf_type.resource> // CHECK: %[[VAL_2:.*]] = "tfl.read_variable"(%[[VAL_1]]) : (tensor<*x!tf_type.resource>) -> tensor<16x16xf32> // CHECK: %[[VAL_3:.*]] = tosa.add %[[VAL_2]], %[[VAL_0]] : (tensor<16x16xf32>, tensor<16x16xf32>) -> tensor<16x16xf32> // CHECK: "tfl.assign_variable"(%[[VAL_1]], %[[VAL_3]]) : (tensor<*x!tf_type.resource>, tensor<16x16xf32>) -> () diff --git a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc index 350d9e47545fb0..6523824611a603 
100644 --- a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc @@ -39,6 +39,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -90,7 +91,7 @@ struct ConvertUint8QConstOp : public RewritePattern { } mlir::DenseElementsAttr src_dense_attr = - tfl_qconst_op.getValue().cast(); + mlir::cast(tfl_qconst_op.getValue()); double type_range_min = static_cast(output_element_type.getStorageTypeMin() - diff --git a/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc b/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc index b64e4eda6d5e37..ba194e3e81c964 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc @@ -19,6 +19,7 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" @@ -52,8 +53,8 @@ LogicalResult TosaDequantizeTFLSoftmaxPattern::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { TFL::SoftmaxOp tfl_softmax_op = cast(op); RankedTensorType input_type = - tfl_softmax_op.getInput().getType().cast(); - if (!input_type.getElementType().isa()) { + mlir::cast(tfl_softmax_op.getInput().getType()); + if (!mlir::isa(input_type.getElementType())) { return failure(); } Location loc = tfl_softmax_op.getLoc(); diff --git a/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc index efee9aa9e9b9c2..ff07b9d6f91039 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc @@ -27,6 +27,7 @@ limitations under the License. 
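The attribute access from ConvertUint8QConstOp above with the cast target restored; a minimal sketch assuming the TFL dialect headers are available:

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Support/LLVM.h"
#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"

// The quantized constant's payload is a DenseElementsAttr; the free-function
// cast replaces the deprecated member cast used before this commit.
mlir::DenseElementsAttr GetQConstPayload(mlir::TFL::QConstOp tfl_qconst_op) {
  return mlir::cast<mlir::DenseElementsAttr>(tfl_qconst_op.getValue());
}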
#include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" @@ -80,7 +81,7 @@ LogicalResult ConvertTFBiasAddOp::matchAndRewrite( auto value = tf_biasadd_op.getValue(); auto bias = tf_biasadd_op.getBias(); - auto bias_shape = bias.getType().cast().getShape(); + auto bias_shape = mlir::cast(bias.getType()).getShape(); if (bias_shape.size() != 1) { return rewriter.notifyMatchFailure(op, "bias tensor must be rank 1"); } @@ -89,7 +90,8 @@ LogicalResult ConvertTFBiasAddOp::matchAndRewrite( llvm::dyn_cast_if_present(value.getDefiningOp())) { // Sanity check to confirm rhs() has the expected shape of bias auto filter_shape = - tf_conv2d_op.getFilter().getType().cast().getShape(); + mlir::cast(tf_conv2d_op.getFilter().getType()) + .getShape(); // Assume the filter shape is [H, W, I, O] if (filter_shape.back() != bias_shape.back()) { @@ -114,7 +116,8 @@ LogicalResult ConvertTFBiasAddOp::matchAndRewrite( llvm::dyn_cast_if_present(value.getDefiningOp())) { // Sanity check to confirm rhs() has the expected shape of bias auto filter_shape = - tf_conv3d_op.getFilter().getType().cast().getShape(); + mlir::cast(tf_conv3d_op.getFilter().getType()) + .getShape(); // Assume the filter shape is [D, H, W, I, O] if (filter_shape.back() != bias_shape.back()) { diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc index 3b461b8b36ae42..25707c2bde1331 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc @@ -56,6 +56,7 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h" @@ -571,7 +572,7 @@ std::optional convertZerosLikeOp(PatternRewriter& rewriter, Attribute zero_attr = rewriter.getZeroAttr(zero_type); return CreateOpAndInfer(rewriter, op->getLoc(), zero_type, - zero_attr.cast()) + mlir::cast(zero_attr)) .getResult(); } @@ -586,12 +587,12 @@ std::optional convertMultiplyOp(PatternRewriter& rewriter, Operation* op, // Not a shaped tensor output if (!input_lhs_type || !input_rhs_type || !output_type) return std::nullopt; - bool input_lhs_is_qtype = - input_lhs_type.getElementType().isa(); - bool input_rhs_is_qtype = - input_rhs_type.getElementType().isa(); - bool output_is_qtype = - output_type.getElementType().isa(); + bool input_lhs_is_qtype = mlir::isa( + input_lhs_type.getElementType()); + bool input_rhs_is_qtype = mlir::isa( + input_rhs_type.getElementType()); + bool output_is_qtype = mlir::isa( + output_type.getElementType()); if (input_lhs_is_qtype != output_is_qtype || input_rhs_is_qtype != output_is_qtype) { @@ -603,12 +604,12 @@ std::optional convertMultiplyOp(PatternRewriter& rewriter, Operation* op, if (output_is_qtype) { ShapedType rescale_type = output_type.clone(rewriter.getI32Type()); - auto input_lhs_qtype = input_lhs_type.getElementType() - .cast(); - auto input_rhs_qtype = input_rhs_type.getElementType() - .cast(); - auto output_qtype = - output_type.getElementType().cast(); + auto input_lhs_qtype = mlir::cast( + input_lhs_type.getElementType()); + auto input_rhs_qtype = mlir::cast( + input_rhs_type.getElementType()); + auto output_qtype = mlir::cast( + output_type.getElementType()); // MLIR store scale as double, but TFLite store scale as float // Downcasting from double to float to match TFLite behavior @@ -661,11 +662,11 @@ std::optional convertSquaredDifferenceOp(PatternRewriter& rewriter, } bool x_is_qtype = - x_type.getElementType().isa(); + mlir::isa(x_type.getElementType()); bool y_is_qtype = - y_type.getElementType().isa(); - bool result_is_qtype = - result_type.getElementType().isa(); + mlir::isa(y_type.getElementType()); + bool result_is_qtype = mlir::isa( + result_type.getElementType()); if (x_is_qtype != result_is_qtype || y_is_qtype != result_is_qtype) { (void)rewriter.notifyMatchFailure( @@ -678,11 +679,11 @@ std::optional convertSquaredDifferenceOp(PatternRewriter& rewriter, // Then scale back to I8 if (result_is_qtype) { auto x_qtype = - x_type.getElementType().cast(); + mlir::cast(x_type.getElementType()); auto y_qtype = - y_type.getElementType().cast(); - auto result_qtype = - result_type.getElementType().cast(); + mlir::cast(y_type.getElementType()); + auto result_qtype = mlir::cast( + result_type.getElementType()); uint32_t result_bits = result_qtype.getStorageTypeIntegralWidth(); @@ -779,16 +780,16 @@ std::optional convertConcatV2Op(PatternRewriter& rewriter, Operation* op, } mlir::quant::UniformQuantizedType result_quant_type = - result_type.getElementType() - .dyn_cast_or_null(); + mlir::dyn_cast_or_null( + result_type.getElementType()); SmallVector values_rescaled; for (auto v : values) { RankedTensorType operand_type = dyn_cast(v.getType()); mlir::quant::UniformQuantizedType operand_quant_type = - operand_type.getElementType() - .dyn_cast_or_null(); + 
mlir::dyn_cast_or_null( + operand_type.getElementType()); // tfl.concat currently allows different scales for each input tensor, which // TFlite team will fix in: @@ -818,7 +819,8 @@ std::optional convertConcatV2Op(PatternRewriter& rewriter, Operation* op, } } - int32_t tensor_rank = values[0].getType().cast().getRank(); + int32_t tensor_rank = + mlir::cast(values[0].getType()).getRank(); if (axis < 0) axis += tensor_rank; if ((axis < 0) || (axis > tensor_rank)) { @@ -1046,7 +1048,8 @@ std::optional convertSpaceToBatchNDOp(PatternRewriter& rewriter, // [padded_shape[M] / block_shape[M-1]] + // remaining_shape int32_t a2_reshape_a1_rank = - a2_reshape_a1_op.getResult().getType().cast().getRank(); + mlir::cast(a2_reshape_a1_op.getResult().getType()) + .getRank(); SmallVector a3_perm(a2_reshape_a1_rank); SmallVector a3_transpose_shape(a2_reshape_a1_rank); @@ -1579,17 +1582,19 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, int32_t input_rank = input_type.getShape().size(); ArrayRef logits_shape = output_type.getShape(); - if (input_type.getElementType().isa() && - output_type.getElementType().isa()) { + if (mlir::isa(input_type.getElementType()) && + mlir::isa(output_type.getElementType())) { SmallVector rsum_shape_v(input_type.getShape().begin(), input_type.getShape().end() - 1); rsum_shape_v.push_back(1); ArrayRef rsum_shape(rsum_shape_v); // The if condition already checks if these are UQTs mlir::quant::UniformQuantizedType in_quant_type = - input_type.getElementType().cast(); + mlir::cast( + input_type.getElementType()); mlir::quant::UniformQuantizedType out_quant_type = - output_type.getElementType().cast(); + mlir::cast( + output_type.getElementType()); auto int16_element_qtype = mlir::quant::UniformQuantizedType::get( true, rewriter.getIntegerType(16), rewriter.getF32Type(), 1.0f, 0, @@ -2005,11 +2010,11 @@ std::optional convertLogSoftmaxOp(PatternRewriter& rewriter, } mlir::quant::UniformQuantizedType in_quant_type = - input_type.getElementType() - .dyn_cast_or_null(); + mlir::dyn_cast_or_null( + input_type.getElementType()); mlir::quant::UniformQuantizedType out_quant_type = - output_type.getElementType() - .dyn_cast_or_null(); + mlir::dyn_cast_or_null( + output_type.getElementType()); if (in_quant_type || out_quant_type) { (void)rewriter.notifyMatchFailure( op, "quantized log_softmax lowering not implemented yet"); @@ -2271,7 +2276,8 @@ std::optional> convertSplitOp( tensorflow::ConvertMlirShapeToTF(new_shape))); } - RankedTensorType slice_type = slice_value.getType().cast(); + RankedTensorType slice_type = + mlir::cast(slice_value.getType()); assert((slice_type.getDimSize(axis) % num_split) == 0); // Each slice has a different beginning point. 
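The guard that recurs across the legalize_common.cc hunks, reconstructed with its template arguments: a lowering bails out unless inputs and output agree on being uniform-quantized. A sketch with hypothetical parameter names:

#include "mlir/Dialect/Quant/QuantTypes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"

bool AgreeOnQuantization(mlir::ShapedType input_type,
                         mlir::ShapedType output_type) {
  bool input_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
      input_type.getElementType());
  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
      output_type.getElementType());
  // Mixed quantized/float signatures are rejected by the patterns above.
  return input_is_qtype == output_is_qtype;
}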
@@ -2442,7 +2448,7 @@ std::optional<Value> convertStridedSliceOp(
   // Limitations:
   //   * This implementation only supports ellipsis_mask=0 for now
   auto input_type = dyn_cast<RankedTensorType>(input_value.getType());
-  ShapedType result_type = result_value.getType().cast<ShapedType>();
+  ShapedType result_type = mlir::cast<ShapedType>(result_value.getType());

   if (ellipsis_mask != 0) {
     (void)rewriter.notifyMatchFailure(op, "ellipses mask not supported yet");
@@ -2586,7 +2592,7 @@ std::optional<Value> convertStridedSliceOp(
   if (all_strides_one) {
     auto reversed =
         reverseNegativeStride(rewriter, op, a1_slice_op.getResult(), strides);
-    auto shape = reversed.getType().cast<ShapedType>().getShape();
+    auto shape = mlir::cast<ShapedType>(reversed.getType()).getShape();

     SmallVector<int64_t> new_shape;
     for (int i = 0; i < input_rank; ++i) {
@@ -2684,9 +2690,9 @@ std::optional<Value> convertFloorDivOp(PatternRewriter& rewriter, Operation* op,

   Type element_type = output_type.getElementType();

-  if (element_type.isa<IntegerType>()) {
-    return CreateOpAndInfer<tosa::DivOp>(rewriter, op->getLoc(), output_type,
-                                         lhs_value, rhs_value)
+  if (mlir::isa<IntegerType>(element_type)) {
+    return CreateOpAndInfer<tosa::IntDivOp>(rewriter, op->getLoc(), output_type,
+                                            lhs_value, rhs_value)
         .getResult();
   }

@@ -2738,14 +2744,14 @@ std::optional<Value> convertFusedActivation(PatternRewriter& rewriter,
   if (!input_type) return std::nullopt;

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());

   if (input_is_qtype) {
     // We can always make output/input tensor's scale/zp always be the same
     // when legalizing fused_activation_function, as it's generated during
     // legalization.
-    auto input_qtype =
-        input_type.getElementType().cast<mlir::quant::UniformQuantizedType>();
+    auto input_qtype = mlir::cast<mlir::quant::UniformQuantizedType>(
+        input_type.getElementType());

     if (fused_activation_fn.getValue() == "NONE") {
       return input_value;
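Because convertFusedActivation above can keep the input and output scale/zero-point identical, a fused activation in the quantized case reduces to a clamp in the quantized storage domain. A hedged sketch of the bound arithmetic only; `quantizedClampBound` is an illustrative stand-in, not a helper from this file:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Map a real-valued activation bound (e.g. 6.0 for RELU6) to a storage-domain
// clamp limit: q = round(real / scale) + zero_point, held within the storage
// range so the clamp is always representable.
int32_t quantizedClampBound(double real_bound, double scale,
                            int64_t zero_point, int32_t storage_min,
                            int32_t storage_max) {
  int64_t q = std::llround(real_bound / scale) + zero_point;
  return static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(q, storage_min), storage_max));
}

For int8 RELU6 the upper bound would be quantizedClampBound(6.0, scale, zp, -128, 127), with the zero point itself serving as the lower bound.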
@@ -3079,9 +3085,9 @@ std::optional<Value> convertReduceProdOp(PatternRewriter& rewriter,
   if (!input_type) return std::nullopt;

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype || output_is_qtype) {
     (void)rewriter.notifyMatchFailure(
@@ -3105,9 +3111,9 @@ std::optional<Value> convertReduceSumOp(PatternRewriter& rewriter,
   if (!input_type) return std::nullopt;

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     (void)rewriter.notifyMatchFailure(
@@ -3123,10 +3129,10 @@ std::optional<Value> convertReduceSumOp(PatternRewriter& rewriter,
   Type reduce_element_type = input_type.getElementType();

   if (input_is_qtype) {
-    auto input_qtype =
-        input_type.getElementType().cast<mlir::quant::UniformQuantizedType>();
-    auto output_qtype =
-        output_type.getElementType().cast<mlir::quant::UniformQuantizedType>();
+    auto input_qtype = mlir::cast<mlir::quant::UniformQuantizedType>(
+        input_type.getElementType());
+    auto output_qtype = mlir::cast<mlir::quant::UniformQuantizedType>(
+        output_type.getElementType());

     int32_t input_shift = 20;

@@ -3164,9 +3170,9 @@ std::optional<Value> convertReduceMeanOp(PatternRewriter& rewriter,
   if (!input_type) return std::nullopt;

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     (void)rewriter.notifyMatchFailure(
@@ -3176,7 +3182,8 @@ std::optional<Value> convertReduceMeanOp(PatternRewriter& rewriter,
   }

   // Only supports float type mean() if it's non-quantized
-  if (!input_is_qtype && !output_type.getElementType().isa<mlir::FloatType>()) {
+  if (!input_is_qtype &&
+      !mlir::isa<mlir::FloatType>(output_type.getElementType())) {
     op->emitWarning("input unquantized type but output element not FloatType");
     return std::nullopt;
   }
@@ -3206,10 +3213,10 @@ std::optional<Value> convertReduceMeanOp(PatternRewriter& rewriter,
   int32_t output_scale_shift = 0;

   if (input_is_qtype) {
-    auto input_qtype =
-        input_type.getElementType().cast<mlir::quant::UniformQuantizedType>();
-    auto output_qtype =
-        output_type.getElementType().cast<mlir::quant::UniformQuantizedType>();
+    auto input_qtype = mlir::cast<mlir::quant::UniformQuantizedType>(
+        input_type.getElementType());
+    auto output_qtype = mlir::cast<mlir::quant::UniformQuantizedType>(
+        output_type.getElementType());

     const int32_t scale_width = 32;
     computeMultiplierAndShift(1.0f, input_scale_multiplier, input_scale_shift,
@@ -3275,9 +3282,9 @@ std::optional<Value> convertResizeOp(PatternRewriter& rewriter, Operation* op,
   }

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     (void)rewriter.notifyMatchFailure(
@@ -3287,7 +3294,7 @@ std::optional<Value> convertResizeOp(PatternRewriter& rewriter, Operation* op,
   }

   if (!input_is_qtype) {
-    if (!input_type.getElementType().isa<mlir::FloatType>()) {
+    if (!mlir::isa<mlir::FloatType>(input_type.getElementType())) {
       (void)rewriter.notifyMatchFailure(
           op, "only quantized or float types supported");
       return std::nullopt;
@@ -3406,8 +3413,8 @@ std::optional<Value> convertResizeOp(PatternRewriter& rewriter, Operation* op,
   // If quantized bilinear mode, need to lower to RESIZE + RESCALE pair.
   if (is_bilinear) {
     RankedTensorType output_acc_type;
-    auto input_element_qtype =
-        input_type.getElementType().cast<mlir::quant::UniformQuantizedType>();
+    auto input_element_qtype = mlir::cast<mlir::quant::UniformQuantizedType>(
+        input_type.getElementType());

     bool is_scale32;

@@ -3505,7 +3512,7 @@ std::optional<Value> convertQuantizeOp(PatternRewriter& rewriter, Operation* op,
   auto output_element_type = output_type.getElementType();

   // output element type could only be quantized integer
-  if (!output_element_type.isa<mlir::quant::QuantizedType>()) {
+  if (!mlir::isa<mlir::quant::QuantizedType>(output_element_type)) {
     (void)rewriter.notifyMatchFailure(
         op, "lowering quantizeOp but output element type not quantized");
     return std::nullopt;
@@ -3546,7 +3553,7 @@ std::optional<Value> convertDequantizeOp(PatternRewriter& rewriter,
   if (!input_type) return std::nullopt;

   // input element type could only be quantized integer
-  if (!input_type.getElementType().isa<mlir::quant::QuantizedType>())
+  if (!mlir::isa<mlir::quant::QuantizedType>(input_type.getElementType()))
     return std::nullopt;

   std::optional<Value> zp_val;
@@ -3839,8 +3846,8 @@ std::optional<Value> convertTFConv2DCommon(
     stride = rewriter.getDenseI64ArrayAttr({1, 1});
   } else {
     // Note: hardcoded to NHWC for now
-    int64_t stride_h = strides_attr[1].cast<IntegerAttr>().getInt();
-    int64_t stride_w = strides_attr[2].cast<IntegerAttr>().getInt();
+    int64_t stride_h = mlir::cast<IntegerAttr>(strides_attr[1]).getInt();
+    int64_t stride_w = mlir::cast<IntegerAttr>(strides_attr[2]).getInt();
     stride = rewriter.getDenseI64ArrayAttr({stride_h, stride_w});
   }
 }
@@ -3849,8 +3856,8 @@ std::optional<Value> convertTFConv2DCommon(
     dilation = rewriter.getDenseI64ArrayAttr({1, 1});
   } else {
     // Note: hardcoded to NHWC for now
-    int64_t dilation_h = dilations_attr[1].cast<IntegerAttr>().getInt();
-    int64_t dilation_w = dilations_attr[2].cast<IntegerAttr>().getInt();
+    int64_t dilation_h = mlir::cast<IntegerAttr>(dilations_attr[1]).getInt();
+    int64_t dilation_w = mlir::cast<IntegerAttr>(dilations_attr[2]).getInt();
     dilation = rewriter.getDenseI64ArrayAttr({dilation_h, dilation_w});
   }
 }
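convertReduceMeanOp above folds the 1/num_elements factor into a RESCALE through computeMultiplierAndShift. The decomposition is the standard fixed-point one; a hedged stand-in for the 32-bit case (our helper name, not the actual implementation in legalize_utils):

#include <cmath>
#include <cstdint>

// Decompose a real scale so that value * scale ~= (value * multiplier) >> shift,
// with multiplier a Q31 fixed-point mantissa. Sketch of the technique only.
void quantizeScale(double scale, int32_t& multiplier, int32_t& shift) {
  if (scale == 0.0) { multiplier = 0; shift = 0; return; }
  int exp = 0;
  const double mantissa = std::frexp(scale, &exp);   // scale = mantissa * 2^exp
  int64_t q = std::llround(mantissa * (1LL << 31));  // mantissa in [0.5, 1)
  if (q == (1LL << 31)) { q >>= 1; ++exp; }          // rounding overflowed
  multiplier = static_cast<int32_t>(q);
  shift = 31 - exp;
}

For scale = 1.0 this yields multiplier = 2^30 and shift = 30, so the rescale is exact; smaller scales trade shift for mantissa precision.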
@@ -3915,8 +3922,8 @@ std::optional<Value> convertConv3DCommon(PatternRewriter& rewriter,
   DenseI64ArrayAttr strides_attr = rewriter.getDenseI64ArrayAttr(strides);
   DenseI64ArrayAttr dilations_attr = rewriter.getDenseI64ArrayAttr(dilations);

-  RankedTensorType input_type = input.getType().cast<RankedTensorType>();
-  RankedTensorType filter_type = filter.getType().cast<RankedTensorType>();
+  RankedTensorType input_type = mlir::cast<RankedTensorType>(input.getType());
+  RankedTensorType filter_type = mlir::cast<RankedTensorType>(filter.getType());

   DenseI64ArrayAttr pads_attr;
   if (!getPaddingValuesFromPadType(tf_pad, data_format_tf, 0, input_type,
@@ -3963,9 +3970,9 @@ std::optional<Value> convertTFConv3DCommon(
     // Defaults to [1, 1, 1].
     strides = {1, 1, 1};
   } else {
-    int64_t stride_d = strides_attr[1].cast<IntegerAttr>().getInt();
-    int64_t stride_h = strides_attr[2].cast<IntegerAttr>().getInt();
-    int64_t stride_w = strides_attr[3].cast<IntegerAttr>().getInt();
+    int64_t stride_d = mlir::cast<IntegerAttr>(strides_attr[1]).getInt();
+    int64_t stride_h = mlir::cast<IntegerAttr>(strides_attr[2]).getInt();
+    int64_t stride_w = mlir::cast<IntegerAttr>(strides_attr[3]).getInt();
     strides = {stride_d, stride_h, stride_w};
   }

@@ -3974,9 +3981,9 @@ std::optional<Value> convertTFConv3DCommon(
     // Defaults to [1, 1, 1].
     dilations = {1, 1, 1};
   } else {
-    int64_t dilation_d = dilations_attr[1].cast<IntegerAttr>().getInt();
-    int64_t dilation_h = dilations_attr[2].cast<IntegerAttr>().getInt();
-    int64_t dilation_w = dilations_attr[3].cast<IntegerAttr>().getInt();
+    int64_t dilation_d = mlir::cast<IntegerAttr>(dilations_attr[1]).getInt();
+    int64_t dilation_h = mlir::cast<IntegerAttr>(dilations_attr[2]).getInt();
+    int64_t dilation_w = mlir::cast<IntegerAttr>(dilations_attr[3]).getInt();
     dilations = {dilation_d, dilation_h, dilation_w};
   }

@@ -4686,7 +4693,7 @@ std::optional<Value> convertSinOp(PatternRewriter& rewriter, Operation* op,
 std::optional<Value> convertSignOp(PatternRewriter& rewriter, Operation* op,
                                    Value input, RankedTensorType output_type) {
   auto output_elem_type = output_type.getElementType();
-  if (output_elem_type.isa<mlir::quant::QuantizedType>()) {
+  if (mlir::isa<mlir::quant::QuantizedType>(output_elem_type)) {
     (void)rewriter.notifyMatchFailure(op, "tfl quantization not yet supported");
     return std::nullopt;
   }
@@ -4695,7 +4702,7 @@ std::optional<Value> convertSignOp(PatternRewriter& rewriter, Operation* op,
   // one element.
   Value pos_one, neg_one, zero;
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
-  if (output_elem_type.isa<FloatType>()) {
+  if (mlir::isa<FloatType>(output_elem_type)) {
     pos_one = getTosaConstTensorSingleF32(rewriter, op, 1.0f);
     neg_one = getTosaConstTensorSingleF32(rewriter, op, -1.0f);
     zero = getTosaConstTensorSingleF32(rewriter, op, 0.0f);
@@ -4733,7 +4740,7 @@ std::optional<Value> convertBroadcastToOp(PatternRewriter& rewriter,
   }

   Type element_type = input_type.getElementType();
-  if (element_type.isa<ComplexType>()) {
+  if (mlir::isa<ComplexType>(element_type)) {
     (void)rewriter.notifyMatchFailure(op, "input element type is complex");
     return std::nullopt;
   }
@@ -4816,7 +4823,7 @@ std::optional<Value> convertBroadcastToOp(PatternRewriter& rewriter,
   RankedTensorType output_type =
       tensorflow::GetTypeFromTFTensorShape(new_shape, element_type);

-  if (element_type.isa<FloatType>()) {
+  if (mlir::isa<FloatType>(element_type)) {
     // F32: legalize to broadcastable Add with (-0.f), instead of 0.f.
     // This is to preserve original values:
     // for corner case where x = -0.f
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc
index 0ab48cc417fc98..904394d370bcce 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc
@@ -257,7 +257,7 @@ LogicalResult ConvertTFSignOp::matchAndRewrite(
   auto tf_sign_op = cast<TF::SignOp>(op);

   RankedTensorType output_type =
-      tf_sign_op.getResult().getType().cast<RankedTensorType>();
+      mlir::cast<RankedTensorType>(tf_sign_op.getResult().getType());

   std::optional<Value> result =
       convertSignOp(rewriter, op, tf_sign_op.getX(), output_type);
@@ -270,7 +270,8 @@ LogicalResult ConvertTFSignOp::matchAndRewrite(
 LogicalResult ConvertTFSinOp::matchAndRewrite(Operation* op,
                                               PatternRewriter& rewriter) const {
   auto tf_sin_op = cast<TF::SinOp>(op);
-  ShapedType output_type = tf_sin_op.getResult().getType().cast<ShapedType>();
+  ShapedType output_type =
+      mlir::cast<ShapedType>(tf_sin_op.getResult().getType());

   std::optional<Value> result =
       convertSinOp(rewriter, op, tf_sin_op.getX(), output_type);
@@ -289,8 +290,8 @@ LogicalResult ConvertTFCosOp::matchAndRewrite(Operation* op,
   if (!input_ty || !output_ty) return failure();

-  bool input_is_fp = input_ty.getElementType().isa<mlir::FloatType>();
-  bool output_is_fp = output_ty.getElementType().isa<mlir::FloatType>();
+  bool input_is_fp = mlir::isa<mlir::FloatType>(input_ty.getElementType());
+  bool output_is_fp = mlir::isa<mlir::FloatType>(output_ty.getElementType());

   if (!input_is_fp || !output_is_fp) {
     return rewriter.notifyMatchFailure(
@@ -427,7 +428,7 @@ LogicalResult ConvertTFRoundOp::matchAndRewrite(
     return rewriter.notifyMatchFailure(op, "input not tensor type");
   }

-  if (input_type.getElementType().isa<FloatType>()) {
+  if (mlir::isa<FloatType>(input_type.getElementType())) {
     std::optional<Value> result = convertRoundOp(
         rewriter, op, tf_round_op.getResult(), tf_round_op.getX());

@@ -519,9 +520,9 @@ LogicalResult ConvertTFRealDivOp::matchAndRewrite(

   Type element_type = output_type.getElementType();

-  if (element_type.isa<IntegerType>()) {
-    CreateReplaceOpAndInfer<tosa::DivOp>(rewriter, op, output_type,
-                                         tf_div_op.getX(), tf_div_op.getY());
+  if (mlir::isa<IntegerType>(element_type)) {
+    CreateReplaceOpAndInfer<tosa::IntDivOp>(rewriter, op, output_type,
+                                            tf_div_op.getX(), tf_div_op.getY());
     return success();
   }

@@ -717,7 +718,8 @@ LogicalResult ConvertTFMaxPoolOp::matchAndRewrite(
 LogicalResult ConvertTFConcatV2Op::matchAndRewrite(
     Operation* op, PatternRewriter& rewriter) const {
   auto tf_concatv2_op = cast<TF::ConcatV2Op>(op);
-  auto result_type = tf_concatv2_op.getResult().getType().cast<ShapedType>();
+  auto result_type =
+      mlir::cast<ShapedType>(tf_concatv2_op.getResult().getType());
   SmallVector<Value> values(tf_concatv2_op.getValues());

   ElementsAttr axis_elems;
@@ -877,7 +879,7 @@ LogicalResult ConvertTFFillOp::matchAndRewrite(
   DenseArrayAttr fill_attr;

   // Convert to a compatible zero type
-  if (value_elem.getShapedType().getElementType().isa<FloatType>()) {
+  if (mlir::isa<FloatType>(value_elem.getShapedType().getElementType())) {
     SmallVector<float> fill_arr(
         total_size,
         value_elem.getValues<FloatAttr>()[0].getValue().convertToFloat());
@@ -891,7 +893,7 @@ LogicalResult ConvertTFFillOp::matchAndRewrite(
         DenseI32ArrayAttr::get(rewriter.getContext(), llvm::ArrayRef(fill_arr));
   }
   auto fill_const_op = CreateOpAndInfer<tosa::ConstOp>(
-      rewriter, op->getLoc(), fill_type, fill_attr.cast<ElementsAttr>());
+      rewriter, op->getLoc(), fill_type, mlir::cast<ElementsAttr>(fill_attr));
   rewriter.replaceOp(op, {fill_const_op.getResult()});

   return success();
@@ -911,8 +913,8 @@ LogicalResult ConvertTFConv2DOp::matchAndRewrite(
   RankedTensorType bias_type = tensorflow::GetTypeFromTFTensorShape(
       {bias_dim}, filter_type.getElementType());
   auto bias_attr = rewriter.getZeroAttr(bias_type);
-  auto bias = CreateOpAndInfer<tosa::ConstOp>(rewriter, op->getLoc(), bias_type,
-                                              bias_attr.cast<ElementsAttr>());
+  auto bias = CreateOpAndInfer<tosa::ConstOp>(
+      rewriter, op->getLoc(), bias_type, mlir::cast<ElementsAttr>(bias_attr));

   std::optional<Value> result = convertTFConv2DCommon(
       rewriter, op, output_type, tf_conv2d_op.getInput(),
@@ -946,8 +948,8 @@ LogicalResult ConvertTFConv3DOp::matchAndRewrite(
   RankedTensorType bias_type =
       RankedTensorType::get({bias_dim}, filter_type.getElementType());
   auto bias_attr = rewriter.getZeroAttr(bias_type);
-  auto bias = CreateOpAndInfer<tosa::ConstOp>(rewriter, op->getLoc(), bias_type,
-                                              bias_attr.cast<ElementsAttr>());
+  auto bias = CreateOpAndInfer<tosa::ConstOp>(
+      rewriter, op->getLoc(), bias_type, mlir::cast<ElementsAttr>(bias_attr));

   std::optional<Value> result = convertTFConv3DCommon(
       rewriter, op, output_type, tf_conv3d_op.getInput(),
@@ -1036,8 +1038,8 @@ LogicalResult ConvertTFDepthwiseConv2dNativeOp::matchAndRewrite(
   RankedTensorType bias_type = tensorflow::GetTypeFromTFTensorShape(
       {bias_dim}, filter_type.getElementType());
   auto bias_attr = rewriter.getZeroAttr(bias_type);
-  auto bias = CreateOpAndInfer<tosa::ConstOp>(rewriter, op->getLoc(), bias_type,
-                                              bias_attr.cast<ElementsAttr>());
+  auto bias = CreateOpAndInfer<tosa::ConstOp>(
+      rewriter, op->getLoc(), bias_type, mlir::cast<ElementsAttr>(bias_attr));

   CreateReplaceOpAndInfer<tosa::DepthwiseConv2DOp>(
       rewriter, op, output_type, tf_dwconv2d_op.getInput(),
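The three TF conv hunks above share one idiom: TOSA convolutions require a bias operand, so when the TF op has none, an all-zero constant is synthesized with getZeroAttr plus tosa::ConstOp. A hedged, stand-alone rendering of that idiom; `makeZeroBias` is our name, not a function in this patch:

#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Support/LLVM.h"

// Build a rank-1 all-zero bias tensor with `channels` elements, as the conv
// legalizations above do before calling convertTFConv{2,3}DCommon.
mlir::Value makeZeroBias(mlir::OpBuilder& builder, mlir::Location loc,
                         int64_t channels, mlir::Type element_type) {
  auto bias_type = mlir::RankedTensorType::get({channels}, element_type);
  auto zero_attr =
      mlir::cast<mlir::ElementsAttr>(builder.getZeroAttr(bias_type));
  return builder.create<mlir::tosa::ConstOp>(loc, bias_type, zero_attr);
}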
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
index b0c9a7189a50aa..abb5b80b92bfeb 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
@@ -321,9 +321,9 @@ LogicalResult ConvertTFLReluOp::matchAndRewrite(
   if (!input_type || !output_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     return rewriter.notifyMatchFailure(
@@ -373,9 +373,9 @@ LogicalResult ConvertTFLRelu1Op::matchAndRewrite(
   if (!input_type || !output_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     return rewriter.notifyMatchFailure(
@@ -423,14 +423,15 @@ LogicalResult ConvertTFLRelu0To1Op::matchAndRewrite(
     Operation* op, PatternRewriter& rewriter) const {
   auto tfl_relu0to1_op = cast<TFL::Relu0To1Op>(op);

-  ShapedType input_type = tfl_relu0to1_op.getX().getType().cast<ShapedType>();
+  ShapedType input_type =
+      mlir::cast<ShapedType>(tfl_relu0to1_op.getX().getType());
   ShapedType output_type =
-      tfl_relu0to1_op.getResult().getType().cast<ShapedType>();
+      mlir::cast<ShapedType>(tfl_relu0to1_op.getResult().getType());

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     return rewriter.notifyMatchFailure(
@@ -444,9 +445,11 @@ LogicalResult ConvertTFLRelu0To1Op::matchAndRewrite(

   if (output_is_qtype && input_is_qtype) {
     UniformQuantizedType input_qtype =
-        input_type.getElementType().cast<UniformQuantizedType>();
+        mlir::cast<UniformQuantizedType>(
+            input_type.getElementType());
     UniformQuantizedType output_qtype =
-        output_type.getElementType().cast<UniformQuantizedType>();
+        mlir::cast<UniformQuantizedType>(
+            output_type.getElementType());

     clamp_min = output_qtype.getZeroPoint();
@@ -482,9 +485,9 @@ LogicalResult ConvertTFLRelu6Op::matchAndRewrite(
   if (!input_type || !output_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     return rewriter.notifyMatchFailure(
@@ -539,12 +542,12 @@ static LogicalResult prepareMatchAndRewriteComparison(
   // Not a shaped tensor output
   if (!input_x_type || !input_y_type || !output_type) return failure();

-  bool input_x_is_qtype =
-      input_x_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool input_y_is_qtype =
-      input_y_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+  bool input_x_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_x_type.getElementType());
+  bool input_y_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_y_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_x_is_qtype != input_y_is_qtype ||
       input_y_is_qtype != output_is_qtype) {
@@ -671,20 +674,20 @@ static LogicalResult matchAndRewriteAddSub(Operation* op,
   auto tfl_add_op = cast<TflOp>(op);

   ShapedType input_lhs_type =
-      tfl_add_op.getLhs().getType().template dyn_cast<ShapedType>();
+      mlir::dyn_cast<ShapedType>(tfl_add_op.getLhs().getType());
   ShapedType input_rhs_type =
-      tfl_add_op.getRhs().getType().template dyn_cast<ShapedType>();
+      mlir::dyn_cast<ShapedType>(tfl_add_op.getRhs().getType());
   ShapedType output_type =
-      tfl_add_op.getResult().getType().template dyn_cast<ShapedType>();
+      mlir::dyn_cast<ShapedType>(tfl_add_op.getResult().getType());
   // Not a ranked tensor output
   if (!input_lhs_type || !input_rhs_type || !output_type) return failure();

-  bool input_lhs_is_qtype =
-      input_lhs_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool input_rhs_is_qtype =
-      input_rhs_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+  bool input_lhs_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_lhs_type.getElementType());
+  bool input_rhs_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_rhs_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_lhs_is_qtype != output_is_qtype ||
       input_rhs_is_qtype != output_is_qtype) {
@@ -847,7 +850,7 @@ LogicalResult ConvertTFLSignOp::matchAndRewrite(
   auto tfl_sign_op = cast<TFL::SignOp>(op);

   RankedTensorType output_type =
-      tfl_sign_op.getResult().getType().cast<RankedTensorType>();
+      mlir::cast<RankedTensorType>(tfl_sign_op.getResult().getType());

   std::optional<Value> result =
       convertSignOp(rewriter, op, tfl_sign_op.getX(), output_type);
@@ -932,7 +935,7 @@ LogicalResult ConvertTFLRoundOp::matchAndRewrite(
     return rewriter.notifyMatchFailure(op, "input not shaped tensor type");
   }

-  if (input_type.getElementType().isa<FloatType>()) {
+  if (mlir::isa<FloatType>(input_type.getElementType())) {
     std::optional<Value> result = convertRoundOp(
         rewriter, op, tfl_round_op.getResult(), tfl_round_op.getX());

@@ -962,10 +965,11 @@ LogicalResult ConvertTFLDivOp::matchAndRewrite(
   Type element_type = output_type.getElementType();
   Value div_op;

-  if (element_type.isa<IntegerType>()) {
+  if (mlir::isa<IntegerType>(element_type)) {
     div_op =
-        CreateOpAndInfer<tosa::DivOp>(rewriter, op->getLoc(), output_type,
-                                      tfl_div_op.getLhs(), tfl_div_op.getRhs())
+        CreateOpAndInfer<tosa::IntDivOp>(
+            rewriter, op->getLoc(), output_type, tfl_div_op.getLhs(),
+            tfl_div_op.getRhs())
             .getResult();
   } else {
     auto reciprocal_op = CreateOpAndInfer<tosa::ReciprocalOp>(
@@ -1006,12 +1010,12 @@ LogicalResult ConvertTFLMaximumOp::matchAndRewrite(
   // Not a shaped tensor output
   if (!input_lhs_type || !input_rhs_type || !output_type) return failure();

-  bool input_lhs_is_qtype =
-      input_lhs_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool input_rhs_is_qtype =
-      input_rhs_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+  bool input_lhs_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_lhs_type.getElementType());
+  bool input_rhs_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_rhs_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_lhs_is_qtype != output_is_qtype ||
       input_rhs_is_qtype != output_is_qtype) {
@@ -1062,12 +1066,12 @@ LogicalResult ConvertTFLMinimumOp::matchAndRewrite(
   // Not a shaped tensor output
   if (!input_lhs_type || !input_rhs_type || !output_type) return failure();

-  bool input_lhs_is_qtype =
-      input_lhs_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool input_rhs_is_qtype =
-      input_rhs_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+  bool input_lhs_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_lhs_type.getElementType());
+  bool input_rhs_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      input_rhs_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_lhs_is_qtype != output_is_qtype ||
       input_rhs_is_qtype != output_is_qtype) {
@@ -1215,12 +1219,12 @@ LogicalResult ConvertTFLAveragePool2DOp::matchAndRewrite(
   // Tosa supports FP16 and FP32 accumulator type for FP16 input. When the time
   // FP16 is supported, the accumulator type can be selected based on trade-off
   // between performance and accuracy. Set to FP32 by default.
-  TypeAttr acc_attr = average_etype.isa<FloatType>()
+  TypeAttr acc_attr = mlir::isa<FloatType>(average_etype)
                           ? mlir::TypeAttr::get(rewriter.getF32Type())
                           : mlir::TypeAttr::get(rewriter.getIntegerType(32));

   Value result;
-  if (average_etype.isa<mlir::quant::UniformQuantizedType>()) {
+  if (mlir::isa<mlir::quant::UniformQuantizedType>(average_etype)) {
     // TensorFlow Lite doesn't use the zero point when calculating
     // quantized average pool, while TOSA does. Force the TOSA
     // zero_points to zero to ensure that the calculations match
@@ -1445,11 +1449,11 @@ LogicalResult ConvertTFLConv2DOp::matchAndRewrite(
   if (!filter_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
   bool filter_is_qtype =
-      filter_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(filter_type.getElementType());
   bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(output_type.getElementType());

   if ((input_is_qtype != filter_is_qtype) ||
       (input_is_qtype != output_is_qtype)) {
@@ -1499,7 +1503,7 @@ LogicalResult ConvertTFLConv2DOp::matchAndRewrite(
       output_is_qtype ? rewriter.getI32Type() : output_type.getElementType();
   if (unquantized_bias) {
     Type new_bias_ety = getElementTypeOrSelf(unquantized_bias.getType());
-    if (auto qtype = new_bias_ety.dyn_cast<mlir::quant::QuantizedType>()) {
+    if (auto qtype = mlir::dyn_cast<mlir::quant::QuantizedType>(new_bias_ety)) {
       new_bias_ety = qtype.getStorageType();
     }
     if (new_bias_ety.getIntOrFloatBitWidth() >
@@ -1555,11 +1559,11 @@ LogicalResult ConvertTFLConv3DOp::matchAndRewrite(
   }

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
   bool filter_is_qtype =
-      filter_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(filter_type.getElementType());
   bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(output_type.getElementType());

   if ((input_is_qtype != filter_is_qtype) ||
       (input_is_qtype != output_is_qtype)) {
@@ -1578,7 +1582,7 @@ LogicalResult ConvertTFLConv3DOp::matchAndRewrite(
         RankedTensorType::get({bias_dim}, filter_type.getElementType());
     auto bias_attr = rewriter.getZeroAttr(bias_type);
     unquantized_bias = CreateOpAndInfer<tosa::ConstOp>(
-        rewriter, op->getLoc(), bias_type, bias_attr.cast<ElementsAttr>());
+        rewriter, op->getLoc(), bias_type, mlir::cast<ElementsAttr>(bias_attr));
   }

   SmallVector<int64_t> strides({tfl_conv3d_op.getStrideD(),
@@ -1588,7 +1592,7 @@ LogicalResult ConvertTFLConv3DOp::matchAndRewrite(
                                   tfl_conv3d_op.getDilationHFactor(),
                                   tfl_conv3d_op.getDilationWFactor()});
   Type bias_ety =
-      unquantized_bias.getType().cast<ShapedType>().getElementType();
+      mlir::cast<ShapedType>(unquantized_bias.getType()).getElementType();
   std::optional<Value> a1_conv3d_op = convertConv3DCommon(
       rewriter, op, output_type.clone(bias_ety), tfl_conv3d_op.getInput(),
       tfl_conv3d_op.getFilter(), unquantized_bias, strides, dilations,
@@ -1634,11 +1638,11 @@ LogicalResult ConvertTFLTransposeConvOp::matchAndRewrite(
   if (!filter_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
   bool filter_is_qtype =
-      filter_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(filter_type.getElementType());
   bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(output_type.getElementType());

   if ((input_is_qtype != filter_is_qtype) ||
       (input_is_qtype != output_is_qtype)) {
@@ -1721,7 +1725,7 @@ LogicalResult ConvertTFLTransposeConvOp::matchAndRewrite(
   }

   if (!zero_bias) return failure();
-  Type bias_ety = zero_bias->getType().cast<ShapedType>().getElementType();
+  Type bias_ety = mlir::cast<ShapedType>(zero_bias->getType()).getElementType();

   auto a1_conv2d_op = CreateOpAndInfer<tosa::TransposeConv2DOp>(
       rewriter, op->getLoc(), output_type.clone(bias_ety),
@@ -1770,11 +1774,11 @@ LogicalResult ConvertTFLDepthwiseConv2DOp::matchAndRewrite(
   if (!filter_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
   bool filter_is_qtype =
-      filter_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(filter_type.getElementType());
   bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(output_type.getElementType());

   if ((input_is_qtype != filter_is_qtype) ||
       (input_is_qtype != output_is_qtype)) {
@@ -1863,7 +1867,7 @@ LogicalResult ConvertTFLDepthwiseConv2DOp::matchAndRewrite(
   Value unquantized_bias = tfl_conv2d_op.getBias();
   if (unquantized_bias) {
     Type new_bias_ety = getElementTypeOrSelf(unquantized_bias.getType());
-    if (auto qtype = new_bias_ety.dyn_cast<mlir::quant::QuantizedType>()) {
+    if (auto qtype = mlir::dyn_cast<mlir::quant::QuantizedType>(new_bias_ety)) {
       new_bias_ety = qtype.getStorageType();
     }
     if (new_bias_ety.getIntOrFloatBitWidth() >
@@ -1906,7 +1910,7 @@ LogicalResult ConvertTFLBatchMatMulOp::matchAndRewrite(
     Operation* op, PatternRewriter& rewriter) const {
   auto tfl_mm_op = cast<TFL::BatchMatMulOp>(op);
-  auto result_ty = tfl_mm_op.getType().cast<ShapedType>();
+  auto result_ty = mlir::cast<ShapedType>(tfl_mm_op.getType());
   Value lhs = tfl_mm_op.getX();
   Value rhs = tfl_mm_op.getY();
   RankedTensorType lhs_ty = dyn_cast<RankedTensorType>(lhs.getType());
@@ -1916,10 +1920,12 @@ LogicalResult ConvertTFLBatchMatMulOp::matchAndRewrite(
   if (!lhs_ty || !rhs_ty) return failure();

-  bool lhs_is_qtype = lhs_ty.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool rhs_is_qtype = rhs_ty.getElementType().isa<mlir::quant::UniformQuantizedType>();
+  bool lhs_is_qtype =
+      mlir::isa<mlir::quant::UniformQuantizedType>(lhs_ty.getElementType());
+  bool rhs_is_qtype =
+      mlir::isa<mlir::quant::UniformQuantizedType>(rhs_ty.getElementType());
   bool result_is_qtype =
-      result_ty.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(result_ty.getElementType());

   if ((lhs_is_qtype != rhs_is_qtype) || (lhs_is_qtype != result_is_qtype)) {
     return rewriter.notifyMatchFailure(
@@ -1951,8 +1957,8 @@ LogicalResult ConvertTFLBatchMatMulOp::matchAndRewrite(
         rewriter, op->getLoc(), UnrankedTensorType::get(rhs_ty.getElementType()),
         rhs, rewriter.getDenseI64ArrayAttr(new_rhs_shape));
-    lhs_ty = lhs.getType().cast<RankedTensorType>();
-    rhs_ty = rhs.getType().cast<RankedTensorType>();
+    lhs_ty = mlir::cast<RankedTensorType>(lhs.getType());
+    rhs_ty = mlir::cast<RankedTensorType>(rhs.getType());
   }

   if (transpose_lhs) {
@@ -1977,12 +1983,12 @@ LogicalResult ConvertTFLBatchMatMulOp::matchAndRewrite(
   Type output_ety;
   if (result_is_qtype) {
-    auto lhs_qty_width = lhs_ty.getElementType()
-                             .cast<mlir::quant::UniformQuantizedType>()
-                             .getStorageTypeIntegralWidth();
-    auto rhs_qty_width = rhs_ty.getElementType()
-                             .cast<mlir::quant::UniformQuantizedType>()
-                             .getStorageTypeIntegralWidth();
+    auto lhs_qty_width =
+        mlir::cast<mlir::quant::UniformQuantizedType>(lhs_ty.getElementType())
+            .getStorageTypeIntegralWidth();
+    auto rhs_qty_width =
+        mlir::cast<mlir::quant::UniformQuantizedType>(rhs_ty.getElementType())
+            .getStorageTypeIntegralWidth();

     if (lhs_qty_width != rhs_qty_width) {
       return rewriter.notifyMatchFailure(
@@ -2001,13 +2007,13 @@ LogicalResult ConvertTFLBatchMatMulOp::matchAndRewrite(
     output_ety = result_ty.getElementType();
   }

-  auto matmul =
+  Value matmul =
       CreateOpAndInfer<tosa::MatMulOp>(
           rewriter, op->getLoc(), UnrankedTensorType::get(output_ety), lhs, rhs)
           .getResult();

   // Conditionally reshape rank back to expected rank.
-  auto matmul_ty = matmul.getType().cast<RankedTensorType>();
+  auto matmul_ty = mlir::cast<RankedTensorType>(matmul.getType());
   if (batch_dims.size() != 1) {
     llvm::SmallVector<int64_t> new_shape{};
     for (auto d : batch_dims) {
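tosa.matmul in the hunk above is strictly rank-3 ([N, H, W] x [N, W, C]), which is why ConvertTFLBatchMatMulOp reshapes higher-rank operands down before multiplying and reshapes back afterwards. A hedged sketch of the shape bookkeeping for static shapes; `flattenToRank3` is illustrative, not from this file:

#include <cstdint>
#include <vector>

// Collapse all leading batch dimensions into N: {2, 3, 4, 5} -> {6, 4, 5}.
// Assumes a static shape of rank >= 3; dynamic dims need separate handling.
std::vector<int64_t> flattenToRank3(const std::vector<int64_t>& shape) {
  int64_t batch = 1;
  for (size_t i = 0; i + 2 < shape.size(); ++i) batch *= shape[i];
  return {batch, shape[shape.size() - 2], shape[shape.size() - 1]};
}

After the multiply, the saved batch_dims are prepended again, which is the `new_shape` loop visible at the end of the hunk.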
@@ -2052,11 +2058,11 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite(
   if (!input_type || !filter_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
   bool filter_is_qtype =
-      filter_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(filter_type.getElementType());
   bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(output_type.getElementType());

   if ((input_is_qtype != filter_is_qtype) ||
       (input_is_qtype != output_is_qtype)) {
@@ -2099,7 +2105,7 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite(
     RankedTensorType new_bias_type;

     DenseElementsAttr bias_attr;
-    if (input_type.getElementType().isa<FloatType>()) {
+    if (mlir::isa<FloatType>(input_type.getElementType())) {
       SmallVector<float> bias_arr(bias_shape[0]);

       for (int i = 0; i < bias_shape[0]; i++) {
@@ -2120,7 +2126,7 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite(
             op, "input must be quantized type if it's not float type");
       }
       auto input_qtype =
-          input_type.getElementType().cast<mlir::quant::UniformQuantizedType>();
+          mlir::cast<mlir::quant::UniformQuantizedType>(input_type.getElementType());
       Type new_bias_ety = input_qtype.getStorageTypeIntegralWidth() == 16
                               ? rewriter.getIntegerType(48)
                               : rewriter.getI32Type();
@@ -2136,7 +2142,7 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite(
     bias_val = tfl_fc_op.getBias();
   }

-  Type bias_ety = bias_val.getType().cast<ShapedType>().getElementType();
+  Type bias_ety = mlir::cast<ShapedType>(bias_val.getType()).getElementType();

   auto fc_op = CreateOpAndInfer<tosa::FullyConnectedOp>(
       rewriter, op->getLoc(), UnrankedTensorType::get(bias_ety), input_val,
@@ -2152,7 +2158,7 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite(
   }

   // If we know the output rank, we need to ensure the output shape is correct.
-  ShapedType fc_type = fc_output.getType().cast<ShapedType>();
+  ShapedType fc_type = mlir::cast<ShapedType>(fc_output.getType());
   if (output_type.hasRank()) {
     llvm::SmallVector<int64_t> output_shape;

@@ -2270,7 +2276,7 @@ LogicalResult ConvertTFLRankOp::matchAndRewrite(
       RankedTensorType::get({1}, rewriter.getIntegerType(32));
   auto rank_attr = DenseI32ArrayAttr::get(rewriter.getContext(), {rank});
   auto rank_const = CreateOpAndInfer<tosa::ConstOp>(
-      rewriter, op->getLoc(), rank_type, rank_attr.cast<ElementsAttr>());
+      rewriter, op->getLoc(), rank_type, mlir::cast<ElementsAttr>(rank_attr));

   rewriter.replaceOp(op, {rank_const.getResult()});

@@ -2303,7 +2309,7 @@ LogicalResult ConvertTFLShapeOp::matchAndRewrite(
   auto shape_attr =
       DenseI32ArrayAttr::get(rewriter.getContext(), llvm::ArrayRef(shape_arr));
   auto shape_const = CreateOpAndInfer<tosa::ConstOp>(
-      rewriter, op->getLoc(), shape_type, shape_attr.cast<ElementsAttr>());
+      rewriter, op->getLoc(), shape_type, mlir::cast<ElementsAttr>(shape_attr));

   rewriter.replaceOp(op, {shape_const.getResult()});

@@ -2376,7 +2382,7 @@ LogicalResult ConvertTFLFillOp::matchAndRewrite(
   DenseArrayAttr fill_attr;

   // Convert to a compatible zero type.
-  if (value_elem.getShapedType().getElementType().isa<FloatType>()) {
+  if (mlir::isa<FloatType>(value_elem.getShapedType().getElementType())) {
     SmallVector<float> fill_arr(
         total_size, value_elem.getValues<APFloat>()[0].convertToFloat());
     fill_attr =
@@ -2388,7 +2394,7 @@ LogicalResult ConvertTFLFillOp::matchAndRewrite(
         DenseI32ArrayAttr::get(rewriter.getContext(), llvm::ArrayRef(fill_arr));
   }
   auto fill_const_op = CreateOpAndInfer<tosa::ConstOp>(
-      rewriter, op->getLoc(), fill_type, fill_attr.cast<ElementsAttr>());
+      rewriter, op->getLoc(), fill_type, mlir::cast<ElementsAttr>(fill_attr));
   rewriter.replaceOp(op, {fill_const_op.getResult()});

   return success();
@@ -2589,11 +2595,11 @@ LogicalResult ConvertTFLRsqrtOp::matchAndRewrite(
       dyn_cast<RankedTensorType>(tfl_rsqrt_op.getX().getType());

   mlir::quant::UniformQuantizedType input_qtype =
-      input_type.getElementType()
-          .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+      mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+          input_type.getElementType());
   mlir::quant::UniformQuantizedType output_qtype =
-      output_type.getElementType()
-          .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+      mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+          output_type.getElementType());

   // Quantization case
   if (input_qtype && output_qtype) {
@@ -2636,7 +2642,7 @@ LogicalResult ConvertTFLL2NormalizationOp::matchAndRewrite(
     Operation* op, PatternRewriter& rewriter) const {
   auto tfl_l2norm_op = cast<TFL::L2NormalizationOp>(op);
   auto input = tfl_l2norm_op.getInput();
-  auto input_ty = input.getType().cast<ShapedType>();
+  auto input_ty = mlir::cast<ShapedType>(input.getType());
   auto loc = op->getLoc();

   if (!input_ty.hasRank()) return failure();
@@ -3200,15 +3206,15 @@ LogicalResult ConvertTFLHardSwishOp::matchAndRewrite(

   // TFL hardswish: f(x) -> (x * relu6(x+3))/6

-  if (input_type.getElementType().isa<mlir::quant::QuantizedType>() &&
-      output_type.getElementType().isa<mlir::quant::QuantizedType>()) {
+  if (mlir::isa<mlir::quant::QuantizedType>(input_type.getElementType()) &&
+      mlir::isa<mlir::quant::QuantizedType>(output_type.getElementType())) {
     // Should match TFLite reference numerical behavior
     mlir::quant::UniformQuantizedType input_qtype =
-        input_type.getElementType()
-            .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+        mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+            input_type.getElementType());
     mlir::quant::UniformQuantizedType output_qtype =
-        output_type.getElementType()
-            .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+        mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+            output_type.getElementType());

     auto hardswish_func = [](double v) -> double {
       double w = v + 3.0;
@@ -3286,8 +3292,8 @@ LogicalResult ConvertTFLCosOp::matchAndRewrite(
   if (!input_ty || !output_ty) return failure();

-  bool input_is_fp = input_ty.getElementType().isa<mlir::FloatType>();
-  bool output_is_fp = output_ty.getElementType().isa<mlir::FloatType>();
+  bool input_is_fp = mlir::isa<mlir::FloatType>(input_ty.getElementType());
+  bool output_is_fp = mlir::isa<mlir::FloatType>(output_ty.getElementType());

   if (!input_is_fp || !output_is_fp) {
     return rewriter.notifyMatchFailure(op, "input/result must be fp");
@@ -3440,9 +3446,9 @@ LogicalResult ConvertTFLLogisticOp::matchAndRewrite(
   if (!input_type || !output_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     return rewriter.notifyMatchFailure(
@@ -3453,11 +3459,11 @@ LogicalResult ConvertTFLLogisticOp::matchAndRewrite(
   if (input_is_qtype) {
     ShapedType int32_type = output_type.clone(rewriter.getIntegerType(32));
     mlir::quant::UniformQuantizedType input_qtype =
-        input_type.getElementType()
-            .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+        mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+            input_type.getElementType());
     mlir::quant::UniformQuantizedType output_qtype =
-        output_type.getElementType()
-            .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+        mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+            output_type.getElementType());

     auto sigmoid_func = [](double x) -> double {
       return 1.0 / (1.0 + std::exp(-x));
@@ -3511,9 +3517,9 @@ LogicalResult ConvertTFLTanhOp::matchAndRewrite(
   if (!input_type || !output_type) return failure();

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     return rewriter.notifyMatchFailure(
@@ -3524,11 +3530,11 @@ LogicalResult ConvertTFLTanhOp::matchAndRewrite(
   if (input_is_qtype) {
     ShapedType int32_type = output_type.clone(rewriter.getIntegerType(32));
     mlir::quant::UniformQuantizedType input_qtype =
-        input_type.getElementType()
-            .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+        mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+            input_type.getElementType());
     mlir::quant::UniformQuantizedType output_qtype =
-        output_type.getElementType()
-            .dyn_cast_or_null<mlir::quant::UniformQuantizedType>();
+        mlir::dyn_cast_or_null<mlir::quant::UniformQuantizedType>(
+            output_type.getElementType());

     auto tanh_func = [](double x) -> double {
       x = std::exp(-2.0 * x);
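The quantized logistic/tanh paths above never evaluate sigmoid_func/tanh_func at runtime; the lambdas are sampled offline into a TOSA 16-bit lookup table. A hedged stand-in for that sampling step, assuming the usual 513-entry TOSA TABLE layout over the int16 input range; the real helper lives in legalize_utils and also prepares interpolation data:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <functional>
#include <vector>

// Sample f over the dequantized int16 input range into 513 table entries,
// each a Q0.15 value clamped to the int16 range. Sketch only.
std::vector<int16_t> sampleTable(const std::function<double(double)>& f,
                                 double input_scale, int32_t input_zp) {
  std::vector<int16_t> table;
  table.reserve(513);
  for (int32_t i = -256; i <= 256; ++i) {
    double x = (i * 128 - input_zp) * input_scale;  // dequantize sample point
    double y = std::clamp(f(x) * 32768.0, -32768.0, 32767.0);
    table.push_back(static_cast<int16_t>(std::lround(y)));
  }
  return table;
}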
@@ -3644,9 +3650,9 @@ static LogicalResult LegalizeQuantizedPrelu(Operation* op,
   // Perform an element-wise multiplication on rescaled alpha and input for
   // PReLU.
   Value alpha = tfl_prelu_op.getAlpha();
-  ShapedType alpha_type = alpha.getType().cast<ShapedType>();
+  ShapedType alpha_type = mlir::cast<ShapedType>(alpha.getType());
   UniformQuantizedType alpha_qtype =
-      alpha_type.getElementType().cast<UniformQuantizedType>();
+      mlir::cast<UniformQuantizedType>(alpha_type.getElementType());

   Value op_rescale_alpha = removeZeroPointAndCastToInt32(
       rewriter, op, alpha, alpha_qtype.getZeroPoint());
@@ -3698,7 +3704,7 @@ static LogicalResult LegalizeQuantizedLeakyRelu(Operation* op,
                                                 PatternRewriter& rewriter,
                                                 Value input, double alpha,
                                                 ShapedType output_type) {
-  ShapedType input_type = input.getType().cast<ShapedType>();
+  ShapedType input_type = mlir::cast<ShapedType>(input.getType());
   ShapedType rescale_type = input_type.clone(rewriter.getI32Type());

   UniformQuantizedType input_qtype =
@@ -3784,9 +3790,9 @@ LogicalResult ConvertTFLLeakyReluOp::matchAndRewrite(
                                        "input or output is not a ShapedType");

   bool input_is_qtype =
-      input_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
-  bool output_is_qtype =
-      output_type.getElementType().isa<mlir::quant::UniformQuantizedType>();
+      mlir::isa<mlir::quant::UniformQuantizedType>(input_type.getElementType());
+  bool output_is_qtype = mlir::isa<mlir::quant::UniformQuantizedType>(
+      output_type.getElementType());

   if (input_is_qtype != output_is_qtype) {
     return rewriter.notifyMatchFailure(
@@ -3846,8 +3852,7 @@ LogicalResult ConvertTFLCustomOp::matchAndRewrite(
   rewriter.replaceOpWithNewOp<tosa::CustomOp>(
       op, op->getResultTypes(), tfl_custom_op.getCustomCode(),
       rewriter.getStringAttr("TFL"),
-      tfl_custom_op.getCustomOption()
-          .cast<mlir::TFL::ConstBytesAttr>()
+      mlir::cast<mlir::TFL::ConstBytesAttr>(tfl_custom_op.getCustomOption())
           .getValue()
           .str(),
       op->getOperands());
@@ -3966,7 +3971,7 @@ LogicalResult ConvertTFLDequantizeOp::matchAndRewrite(
   if (!qtype) return failure();

   Type element_type = qtype.getElementType();
-  if (element_type.isa<FloatType>()) {
+  if (mlir::isa<FloatType>(element_type)) {
     CreateReplaceOpAndInfer<tosa::CastOp>(rewriter, op, output_type,
                                           tfl_dequantize_op.getInput());
     return success();
@@ -4023,7 +4028,7 @@ LogicalResult ConvertTFLConstOp::matchAndRewrite(
   ElementsAttr elements = tfl_const_op.getValue();

   Type element_type = elements.getShapedType().getElementType();
-  if (output_type.getElementType().isa<mlir::quant::QuantizedType>()) {
+  if (mlir::isa<mlir::quant::QuantizedType>(output_type.getElementType())) {
     output_type = RankedTensorType::get(output_type.getShape(), element_type);
   }

@@ -4031,7 +4036,8 @@ LogicalResult ConvertTFLConstOp::matchAndRewrite(
   // attribute shape. This occurs as some TFLite folders create constants with
   // unranked shapes.
   if (!output_type.hasRank()) {
-    output_type = elements.getType().cast<ShapedType>().clone(element_type);
+    output_type =
+        mlir::cast<ShapedType>(elements.getType()).clone(element_type);
   }

   rewriter.replaceOpWithNewOp<tosa::ConstOp>(op, output_type, elements);
@@ -4053,8 +4059,8 @@ LogicalResult ConvertTFLQConstOp::matchAndRewrite(
   // attribute shape. This occurs as some TFLite folders create constants with
   // unranked shapes.
   if (!output_type.hasRank()) {
-    output_type = elements.getType().cast<ShapedType>().clone(
-        output_type.getElementType());
+    output_type = mlir::cast<ShapedType>(elements.getType())
+                      .clone(output_type.getElementType());
   }

   rewriter.replaceOpWithNewOp<tosa::ConstOp>(op, output_type, elements);
@@ -4079,7 +4085,7 @@ LogicalResult ConvertConstantOp::matchAndRewrite(
   // For data type like 64 bits, we need to truncate them into 48 bits.
   if (e_type.isInteger(64)) {
     e_type = rewriter.getIntegerType(48);
-    attr = attr.cast<DenseIntElementsAttr>().mapValues(
+    attr = mlir::cast<DenseIntElementsAttr>(attr).mapValues(
         e_type, [](const APInt& x) -> APInt { return x.trunc(48); });
   }

@@ -4136,11 +4142,11 @@ LogicalResult ConvertTFLSparseToDenseOp::matchAndRewrite(
   auto indices = tfl_sparse_to_dense_op.getSparseIndices();
   auto values = tfl_sparse_to_dense_op.getSparseValues();
   auto default_value = tfl_sparse_to_dense_op.getDefaultValue();
-  auto indices_ty = indices.getType().cast<ShapedType>();
+  auto indices_ty = mlir::cast<ShapedType>(indices.getType());
   auto indices_ety = indices_ty.getElementType();
-  auto values_ty = values.getType().cast<ShapedType>();
+  auto values_ty = mlir::cast<ShapedType>(values.getType());
   auto result_ty =
-      tfl_sparse_to_dense_op.getResult().getType().cast<ShapedType>();
+      mlir::cast<ShapedType>(tfl_sparse_to_dense_op.getResult().getType());
   auto result_ety = result_ty.getElementType();
   auto loc = op->getLoc();

@@ -4262,7 +4268,7 @@ LogicalResult ConvertTFLArgMinOp::matchAndRewrite(
   auto arg_max_op = cast<TFL::ArgMinOp>(op);
   auto loc = arg_max_op.getLoc();
   auto input = arg_max_op.getInput();
-  auto input_ty = input.getType().cast<ShapedType>();
+  auto input_ty = mlir::cast<ShapedType>(input.getType());
   Type input_ety = input_ty.getElementType();

   if (auto quantized_ty = dyn_cast<mlir::quant::UniformQuantizedType>(input_ety)) {
@@ -4281,9 +4287,9 @@ LogicalResult ConvertTFLArgMinOp::matchAndRewrite(
   int32_t dim = dim_elems.getValues<APInt>()[0].getSExtValue();
   if (dim < 0) dim += input_ty.getRank();

-  if (input_ety.isa<FloatType>()) {
+  if (mlir::isa<FloatType>(input_ety)) {
     input = CreateOpAndInfer<tosa::NegateOp>(rewriter, loc, input_ty, input);
-  } else if (input_ety.isa<IntegerType>()) {
+  } else if (mlir::isa<IntegerType>(input_ety)) {
     auto reverse_ty = RankedTensorType::get({}, input_ety);
     Value reverse_val = rewriter.create<tosa::ConstOp>(
         loc, reverse_ty,
@@ -4370,12 +4376,12 @@ LogicalResult ConvertTFLRealOp::matchAndRewrite(
   Type input_ety = input_ty.getElementType();

   // For non-complex inputs, return the original tensor.
-  if (!input_ety.isa<ComplexType>()) {
+  if (!mlir::isa<ComplexType>(input_ety)) {
     CreateReplaceOpAndInfer<tosa::IdentityOp>(rewriter, op, input_ty, input);
     return success();
   }

-  if (!input_ety.cast<ComplexType>().getElementType().isF32()) {
+  if (!mlir::cast<ComplexType>(input_ety).getElementType().isF32()) {
     return rewriter.notifyMatchFailure(
         op, "complex input must be of type complex64");
   }
@@ -4425,13 +4431,13 @@ LogicalResult ConvertTFLImagOp::matchAndRewrite(
   Type input_ety = input_ty.getElementType();

   // For non-complex inputs return all zero's.
-  if (!input_ety.isa<ComplexType>()) {
+  if (!mlir::isa<ComplexType>(input_ety)) {
     CreateReplaceOpAndInfer<tosa::ConstOp>(
         rewriter, op, input_ty, DenseElementsAttr::get(input_ty, {0.0f}));
     return success();
   }

-  if (!input_ety.cast<ComplexType>().getElementType().isF32()) {
+  if (!mlir::cast<ComplexType>(input_ety).getElementType().isF32()) {
     return rewriter.notifyMatchFailure(
         op, "complex input must be of type complex64");
   }
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
index de8e777a7d558e..8571995d719484 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h"
@@ -78,7 +79,7 @@ std::optional<Value> buildReshapeWithDynamicDims(PatternRewriter& rewriter,
                                                  Value input_value,
                                                  ShapedType output_type,
                                                  llvm::ArrayRef<Value> dims) {
-  auto e_ty = input_value.getType().cast<ShapedType>().getElementType();
+  auto e_ty = mlir::cast<ShapedType>(input_value.getType()).getElementType();
   llvm::SmallVector<int64_t> static_dims;

   if (output_type.hasRank()) {
@@ -92,7 +93,7 @@ std::optional<Value> buildReshapeWithDynamicDims(PatternRewriter& rewriter,
     auto dim = dims[i];
     SplatElementsAttr dim_attr;
     if (matchPattern(dim, m_Constant(&dim_attr))) {
-      if (dim_attr.getType().cast<ShapedType>().getRank() != 0) {
+      if (mlir::cast<ShapedType>(dim_attr.getType()).getRank() != 0) {
         (void)rewriter.notifyMatchFailure(
             op, "dim for building tosa::ReshapeOp should be rank-0");
         return std::nullopt;
@@ -643,8 +644,8 @@ DenseI64ArrayAttr getPaddingValuesFromExplicitPadAttr(
   for (int i = 0; i < 2; i++) {  // Two spatial dimensions X&Y
     int64_t dim = GetTensorSpatialDimIndex(4, data_format_tf, i);  // 4D tensor, NHWC/NCHW format
-    pad_before = explicit_pad[dim * 2].template cast<IntegerAttr>().getInt();
-    pad_after = explicit_pad[dim * 2 + 1].template cast<IntegerAttr>().getInt();
+    pad_before = mlir::cast<IntegerAttr>(explicit_pad[dim * 2]).getInt();
+    pad_after = mlir::cast<IntegerAttr>(explicit_pad[dim * 2 + 1]).getInt();
     computed_paddings.push_back(pad_before);
     computed_paddings.push_back(pad_after);
   }
@@ -801,11 +802,11 @@ LogicalResult ApplyPatternsWithShapeResolution(
   // This should be investigate for whether it is still necessary due to quant
   // type stripping changing.
   func.walk([&](tosa::ConstOp op) {
-    if (op.getType().getElementType().isa<mlir::quant::QuantizedType>()) {
+    if (mlir::isa<mlir::quant::QuantizedType>(op.getType().getElementType())) {
       return;
     }
     auto ety = op.getValue().getShapedType().getElementType();
-    auto new_ty = op.getType().cast<ShapedType>().clone(ety);
+    auto new_ty = mlir::cast<ShapedType>(op.getType()).clone(ety);
     op.getResult().setType(new_ty);
   });
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h
index d2e04ac869ae48..acb9dff2a4a8ff 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h
@@ -202,7 +202,7 @@ TosaOp CreateOpAndInfer(ImplicitLocOpBuilder& builder, Type result_ty,

   // Compute the knowledge based on the inferred type.
   auto inferredKnowledge = ValueKnowledge::getPessimisticValueState();
-  inferredKnowledge.dtype = result_ty.cast<ShapedType>().getElementType();
+  inferredKnowledge.dtype = mlir::cast<ShapedType>(result_ty).getElementType();
   inferredKnowledge.hasRank = predictedShape.hasRank();
   if (predictedShape.hasRank()) {
     for (auto dim : predictedShape.getDims()) {
diff --git a/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc b/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc
index 987ac5deb7479f..765cf33aa08812 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
 #include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
@@ -115,7 +116,7 @@ class GenericTypeConvert : public ConversionPattern {

 static bool isIllegalType(Type type) {
   if (auto shapedType = dyn_cast<ShapedType>(type)) {
-    return shapedType.getElementType().isa<ComplexType>();
+    return mlir::isa<ComplexType>(shapedType.getElementType());
   }
   return false;
 }
diff --git a/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc b/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc
index 85df18855769fc..11857f3b1c3404 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
@@ -121,7 +122,7 @@ class GenericTypeConvert : public ConversionPattern {
 };

 static bool isIllegalType(Type type) {
-  if (type.isa<quant::QuantizedType>()) return true;
+  if (mlir::isa<quant::QuantizedType>(type)) return true;
   if (auto shapedType = dyn_cast<ShapedType>(type)) {
     return isIllegalType(shapedType.getElementType());
   }
diff --git a/tensorflow/compiler/mlir/utils/BUILD b/tensorflow/compiler/mlir/utils/BUILD
index e34e9cf7be7cca..2256c421b45717 100644
--- a/tensorflow/compiler/mlir/utils/BUILD
+++ b/tensorflow/compiler/mlir/utils/BUILD
@@ -16,6 +16,7 @@ cc_library(
     deps = [
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
diff --git a/tensorflow/compiler/mlir/utils/name_utils.cc b/tensorflow/compiler/mlir/utils/name_utils.cc
index 6ca366fc9d64d5..7ce1c46861c2bb 100644
--- a/tensorflow/compiler/mlir/utils/name_utils.cc
+++ b/tensorflow/compiler/mlir/utils/name_utils.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project

 namespace mlir {

@@ -63,7 +64,7 @@ std::string GetNameFromLoc(Location loc) {
   while (!locs.empty()) {
     Location curr_loc = locs.pop_back_val();

-    if (auto name_loc = curr_loc.dyn_cast<NameLoc>()) {
+    if (auto name_loc = mlir::dyn_cast<NameLoc>(curr_loc)) {
       // Add name in NameLoc. For NameLoc we also account for names due to ops
       // in functions where the op's name is first.
       auto name = name_loc.getName().strref().split('@').first;
@@ -73,11 +74,11 @@ std::string GetNameFromLoc(Location loc) {
         if (!name.empty()) names_is_nonempty = true;
       }
       continue;
-    } else if (auto call_loc = curr_loc.dyn_cast<CallSiteLoc>()) {
+    } else if (auto call_loc = mlir::dyn_cast<CallSiteLoc>(curr_loc)) {
       // Use location of the Callee to generate the name.
       locs.push_back(call_loc.getCallee());
       continue;
-    } else if (auto fused_loc = curr_loc.dyn_cast<FusedLoc>()) {
+    } else if (auto fused_loc = mlir::dyn_cast<FusedLoc>(curr_loc)) {
       // Push all locations in FusedLoc in reverse order, so locations are
       // visited based on order in FusedLoc.
       auto reversed_fused_locs = llvm::reverse(fused_loc.getLocations());
diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py
index 94589f47c05d1b..60a7949138e9c1 100644
--- a/tensorflow/compiler/tests/gather_nd_op_test.py
+++ b/tensorflow/compiler/tests/gather_nd_op_test.py
@@ -17,7 +17,6 @@
 import numpy as np

 from tensorflow.compiler.tests import xla_test
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -43,7 +42,7 @@ def testSimpleDtype(self):
             np.array([[4], [4], [0]], np.int32)))

   @test_util.disable_mlir_bridge("Error handling")
-  def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
+  def testEmptyIndicesAndParamsAndEmptyParamsOk(self):
     with self.session():
       params = np.ones((3, 3), dtype=np.float32)

@@ -60,11 +59,11 @@ def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
       gather_nd_ok_val = self._runGather(params_empty, indices_empty)
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)

+      # A zero-sized gather dimension in params now produces a constant of 0.
       params_empty = np.empty((0, 3), dtype=np.float32)
       indices_nonempty = np.zeros((1, 2), dtype=np.int32)
-      with self.assertRaisesWithPredicateMatch(
-          errors.InvalidArgumentError, r"Gather dimension 0 is of size zero"):
-        self._runGather(params_empty, indices_nonempty)
+      gather_nd_ok_val = self._runGather(params_empty, indices_nonempty)
+      self.assertAllEqual(gather_nd_ok_val, np.zeros([3]))

   def testIndexScalar(self):
     params = np.array(
diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py
index 46f192648ecaa6..32ec425c02790a 100644
--- a/tensorflow/compiler/tests/xla_ops_test.py
+++ b/tensorflow/compiler/tests/xla_ops_test.py
@@ -1306,7 +1306,53 @@ def testRngBitGenerator(self, algorithm, dtype):
     with self.assertRaisesRegex(
         TypeError, 'Failed to convert elements .* to Tensor'
     ):
-      res = xla.rng_bit_generator(algorithm, initial_state, shape, dtype=dtype)
+      _ = xla.rng_bit_generator(algorithm, initial_state, shape, dtype=dtype)
+
+  def testGatherShapeInference(self):
+    operand = np.arange(10, dtype=np.int32).reshape([2, 5])
+    start_indices = np.array([2], np.int32)
+    slice_sizes = np.array([1, 3], np.int32)
+    dimension_numbers = xla_data_pb2.GatherDimensionNumbers(
+        offset_dims=[1],
+        collapsed_slice_dims=[0],
+        start_index_map=[0],
+        index_vector_dim=1,
+    )
+
+    res = xla.gather(operand, start_indices, dimension_numbers, slice_sizes)
+    self.assertEqual(res.shape, tensor_shape.TensorShape([1, 3]))
+
+  def testGatherShapeInferenceDynamicSlice(self):
+    operand = np.arange(12, dtype=np.int32).reshape([3, 2, 2])
+    start_indices = array_ops.placeholder(np.int32, shape=(3, None, 2))
+    slice_sizes = np.array([1, 2, 2], np.int32)
+    dimension_numbers = xla_data_pb2.GatherDimensionNumbers(
+        offset_dims=[2, 3],
+        collapsed_slice_dims=[0],
+        start_index_map=[0, 1],
+        index_vector_dim=2,
+    )
+
+    res = xla.gather(operand, start_indices, dimension_numbers, slice_sizes)
+    self.assertEqual(res.shape, tensor_shape.TensorShape([3, None, 2, 2]))
+
+  def testGatherShapeInferenceDynamicInput(self):
+    operand = array_ops.placeholder(np.int32, shape=(None, 5))
+    start_indices = np.array([2], np.int32)
+    slice_sizes = np.array([1, 3], np.int32)
+    dimension_numbers = xla_data_pb2.GatherDimensionNumbers()
+
+    res = xla.gather(operand, start_indices, dimension_numbers, slice_sizes)
+    self.assertEqual(res.shape, tensor_shape.unknown_shape())
+
+  def testGatherShapeInferenceUnknownSliceSizes(self):
+    operand = np.arange(10, dtype=np.int32).reshape([2, 5])
+    start_indices = np.array([2], np.int32)
+    slice_sizes = array_ops.placeholder(np.int32, shape=(2,))
+    dimension_numbers = xla_data_pb2.GatherDimensionNumbers()
+
+    res = xla.gather(operand, start_indices, dimension_numbers, slice_sizes)
+    self.assertEqual(res.shape, tensor_shape.unknown_shape())

 if __name__ == '__main__':
diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
index 83d5f9b59656ed..cc02348fc86726 100644
--- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
-
 #include

 #if GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 01e85cc7c6cfc7..c1a2243de20146 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -54,6 +54,7 @@ package_group(
         "//third_party/mlperf/submissions/training/v0_7/models/...",
         "//third_party/py/keras_cv/...",
         "//third_party/py/tf_keras/...",
+        "//third_party/sparse_conv/ops/...",
         "//waymo/ml/deploy/benchmark/...",
     ],
 )
@@ -218,7 +219,6 @@ filegroup(
         "@local_tsl//tsl/framework/fixedpoint:xla_cpu_runtime_hdrs",
         "@local_tsl//tsl/platform:xla_cpu_runtime_srcs",
         "@local_xla//xla:cpu_runtime_hdrs",
-        "@local_xla//xla/runtime:aot_ffi_execution_context_hdrs",
         "@local_xla//xla/service:custom_call_status_hdrs",
         "@local_xla//xla/service/cpu:runtime_hdrs",
     ],
 )
@@ -391,7 +391,6 @@ cc_library(
         # binary produced by tfcompile.
         "@local_xla//xla:cpu_function_runtime",
         "@local_xla//xla:executable_run_options",
-        "@local_xla//xla/runtime:aot_ffi_execution_context",
         "@local_xla//xla/service/cpu:buffer_desc",
         "//tensorflow/core/platform:types",
     ],
@@ -500,6 +499,7 @@ cc_library(
         "//tensorflow/compiler/jit:flags",
         "//tensorflow/compiler/jit:shape_inference",
        "//tensorflow/compiler/jit:xla_compile_util",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy",
         "//tensorflow/compiler/mlir/tf2xla/api/v1:compile_mlir_util_no_tf_dialect_passes",
         "//tensorflow/compiler/mlir/utils:array_container_utils",
@@ -706,7 +706,7 @@ cc_library(
         "@local_xla//xla/hlo/ir:hlo",
         "@local_xla//xla/service:computation_placer_hdr",
         "@local_xla//xla/service/gpu:gpu_executable_run_options",
-        "@local_xla//xla/service/gpu:nccl_clique_key",
+        "@local_xla//xla/service/gpu/runtime:nccl_clique_key",
         "@local_xla//xla/stream_executor",
         "@local_xla//xla/translate/mhlo_to_hlo:layout_util",
     ],
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 6a60149d7cc4a1..31e227970083e6 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -297,6 +297,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/log",
         "@local_xla//xla:literal_util",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:status_macros",
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 5877aea0269643..8087b271ba5fe2 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -91,9 +91,14 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape,

   for (int64_t i = 0; i < num_index_dims; ++i) {
     if (input_shape.dim_size(axis + i) == 0) {
-      return errors::InvalidArgument("Gather dimension ", axis + i,
-                                     " is of size zero in tensor with shape ",
-                                     input_shape.DebugString());
+      // A gather dimension of size zero in the input tensor results in a
+      // constant 0. This is done to match the legacy behavior of the MLIR
+      // legalization and avoid breaking existing models.
+      auto slice_sizes = input_shape.dim_sizes();
+      slice_sizes.erase(slice_sizes.begin() + axis);
+      *gather_output =
+          xla::Broadcast(XlaHelpers::Zero(builder, dtype), slice_sizes);
+      return absl::OkStatus();
     }
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
index d7a1b5f970561a..c5229072b56429 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include
 
+#include "absl/log/log.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "xla/client/xla_builder.h"
 #include "xla/literal_util.h"
@@ -492,6 +493,27 @@ Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index,
   start_indices[0] = index;
 
   xla::XlaOp list_part = xla::GetTupleElement(list, 0);
+  {
+    TF_ASSIGN_OR_RETURN(const xla::Shape* list_part_shape,
+                        b->GetShapePtr(list_part));
+    TF_ASSIGN_OR_RETURN(const xla::Shape* update_shape, b->GetShapePtr(update));
+    for (int i = 0; i < list_part_shape->dimensions_size(); ++i) {
+      auto list_part_dim_size = list_part_shape->dimensions(i);
+      auto update_dim_size = update_shape->dimensions(i);
+      // If the update is larger than the list part, the DynamicUpdateSlice
+      // would fail, so skip the update and return the list unchanged.
+      if (update_dim_size > list_part_dim_size) {
+        LOG_FIRST_N(WARNING, 1)
+            << "Warning: TensorListSetItem: ignoring set item because the "
+               "update dim ["
+            << update_dim_size << "] is larger than the list dim ["
+            << list_part_dim_size << "] at dimension " << i << ".";
+
+        *result = list;
+        return absl::OkStatus();
+      }
+    }
+  }
   xla::XlaOp updated_list_part =
       xla::DynamicUpdateSlice(list_part, update, start_indices);
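The new guard reduces to an element-wise dimension comparison; a standalone sketch under the assumption that both shapes are static and of equal rank, as in the guarded code path:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for the new check in ExecuteTensorListSetItem: DynamicUpdateSlice
// requires every update dimension to fit inside the operand, so an oversized
// update now becomes a warning plus no-op instead of a hard failure when the
// XLA computation is built.
bool UpdateFitsInList(const std::vector<int64_t>& list_dims,
                      const std::vector<int64_t>& update_dims) {
  for (std::size_t i = 0; i < list_dims.size(); ++i) {
    if (update_dims[i] > list_dims[i]) return false;  // would fail the DUS
  }
  return true;
}
```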
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc
index 5c19b9fe1014d3..39d4b086788ffe 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc
@@ -116,8 +116,8 @@ constexpr llvm::StringRef kCustomCallShimTarget =
 }  // namespace
 
 bool IsTokenType(mlir::Type type) {
-  return type.isa<mlir::stablehlo::TokenType>() ||
-      type.isa<mlir::mhlo::TokenType>();
+  return mlir::isa<mlir::stablehlo::TokenType>(type) ||
+         mlir::isa<mlir::mhlo::TokenType>(type);
 }
 
 absl::StatusOr<std::unique_ptr<XlaCallModuleLoader>>
@@ -174,7 +174,7 @@ absl::Status XlaCallModuleLoader::SetPlatformIndex(
   op_builder.setInsertionPointToStart(&main_body);
   mlir::BlockArgument platform_index_arg = main_body.getArgument(0);
   mlir::RankedTensorType arg_ranked_type =
-      platform_index_arg.getType().dyn_cast<mlir::RankedTensorType>();
+      mlir::dyn_cast<mlir::RankedTensorType>(platform_index_arg.getType());
   if (!arg_ranked_type || arg_ranked_type.getRank() != 0 ||
       !(arg_ranked_type.getElementType().isSignlessInteger(32) ||
         arg_ranked_type.getElementType().isSignlessInteger(64))) {
@@ -301,7 +301,7 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes(
         << mlir::debugString(type) << " for argument type "
         << mlir::debugString(arg_type);
     mlir::TensorType arg_type =
-        main_body.getArgument(i).getType().dyn_cast<mlir::TensorType>();
+        mlir::dyn_cast<mlir::TensorType>(main_body.getArgument(i).getType());
     if (arg_type == nullptr) {
       return absl::InvalidArgumentError(absl::StrCat(
           "Argument ", i, " passed to XlaCallModule is not a tensor, ",
@@ -316,7 +316,8 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes(
           mlir::debugString(arg_type), ", got ", mlir::debugString(type)));
     }
 
-    if (auto ranked_arg_type = arg_type.dyn_cast<mlir::RankedTensorType>()) {
+    if (auto ranked_arg_type =
+            mlir::dyn_cast<mlir::RankedTensorType>(arg_type)) {
       if (mlir::failed(mlir::verifyCompatibleShape(ranked_arg_type.getShape(),
                                                    type.getShape()))) {
         return absl::InvalidArgumentError(absl::StrCat(
@@ -380,9 +381,10 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes(
     if (IsTokenType(arg_type) || is_input_refined) {
       continue;
     }
-    auto ranked_arg_type = arg_type.dyn_cast<mlir::RankedTensorType>();
+    auto ranked_arg_type = mlir::dyn_cast<mlir::RankedTensorType>(arg_type);
     if (!ranked_arg_type || !ranked_arg_type.hasStaticShape()) {
-      auto type = static_array_input_types[i].cast<mlir::RankedTensorType>();
+      auto type =
+          mlir::cast<mlir::RankedTensorType>(static_array_input_types[i]);
       auto custom_call = MakeShapeRefinementOperandWrapper(op_builder, arg,
                                                            type.getShape());
       auto call_result = custom_call.getResult(0);
@@ -409,8 +411,8 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes(
   // Clean up custom_call shims.
   for (auto call : llvm::make_early_inc_range(
            main_body.getOps<mlir::stablehlo::CustomCallOp>())) {
-    if (call->getAttr("call_target_name").cast<mlir::StringAttr>().strref() ==
-        kCustomCallShimTarget) {
+    if (mlir::cast<mlir::StringAttr>(call->getAttr("call_target_name"))
+            .strref() == kCustomCallShimTarget) {
       auto operand = call->getOperand(0);
       auto result = call->getResult(0);
       if (operand.getType() != result.getType()) {
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
index dbba8e7d8afd6a..529f27a0f7b25d 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "mlir/Support/DebugStringHelper.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h"
@@ -403,7 +404,7 @@ class XlaCallModuleOp : public XlaOpKernel {
     mlir::TypeRange input_types(custom_call->getOperandTypes());
     if (custom_call_has_token_input_output) {
       if (input_types.empty() ||
-          !input_types.front().isa<mlir::stablehlo::TokenType>()) {
+          !mlir::isa<mlir::stablehlo::TokenType>(input_types.front())) {
         return absl::InvalidArgumentError(absl::StrCat(
             "stablehlo.custom_call with has_token_input_output = true is "
             "expected to take !stablehlo.token as the first argument, but "
@@ -422,7 +423,7 @@ class XlaCallModuleOp : public XlaOpKernel {
     mlir::TypeRange result_types(custom_call->getResultTypes());
     if (custom_call_has_token_input_output) {
       if (result_types.empty() ||
-          !result_types.front().isa<mlir::stablehlo::TokenType>()) {
+          !mlir::isa<mlir::stablehlo::TokenType>(result_types.front())) {
         return absl::InvalidArgumentError(absl::StrCat(
             "stablehlo.custom_call with has_token_input_output = true is "
             "expected to return !stablehlo.token as the first result, but "
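The xla_call_module edits above are mechanical migrations from the deprecated member-style casts (`value.isa<T>()`, `value.dyn_cast<T>()`, `value.cast<T>()`) to the free functions that recent MLIR re-exports through mlir/Support/LLVM.h; the new include in xla_call_module_op.cc pulls in those declarations. The pattern in isolation (function names here are illustrative, not from this change):

```cpp
#include "mlir/IR/BuiltinTypes.h"  // mlir::RankedTensorType
#include "mlir/Support/LLVM.h"     // mlir::isa / mlir::dyn_cast / mlir::cast

bool IsRankedTensor(mlir::Type t) {
  // Before: t.isa<mlir::RankedTensorType>()
  return mlir::isa<mlir::RankedTensorType>(t);
}

mlir::RankedTensorType AsRankedTensor(mlir::Type t) {
  // Before: t.dyn_cast<mlir::RankedTensorType>() -- still null on mismatch.
  return mlir::dyn_cast<mlir::RankedTensorType>(t);
}
```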
diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
index d1a2a68d045bfc..9994f3ae2e5c56 100644
--- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
+++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
@@ -208,9 +208,21 @@ MlirOptimizationPassState MlirBridgePass::GetPassState(
     return MlirOptimizationPassState::Disabled;
   }
 
+  // TODO(b/328084279): When MlirBridgePass::GetPassState() returns
+  // MlirOptimizationPassState::FallbackEnabled or
+  // MlirOptimizationPassState::Enabled, TensorFlow imports the Graph into
+  // an MLIR module, calls MlirBridgePass::Run(), and exports the module
+  // back to a Graph. This Graph->MLIR module->Graph round trip is skipped
+  // when MlirOptimizationPassState::Disabled is returned, and it does not
+  // always reproduce the same Graph. Some input graphs that have a TPU
+  // device in device_set yet no replication depend on the round trip, so
+  // call HasTPUDevice(*device_set) to ensure such graphs still take it.
+  // Note that MlirBridgePass::Run() will still reject these graphs, so
+  // they do not go through the Phase 1 Bridge.
   return GetPassStateImpl(
-      /*is_supported_by_replicated_brige*/ IsSupportedByReplicatedBridge(
-          graph, &function_library),
+      /*is_supported_by_replicated_brige*/ IsSupportedByReplicatedBridge(
+          graph, &function_library) ||
+          HasTPUDevice(*device_set),
       config_proto, graph, function_library);
 }
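The gating decision itself is a simple disjunction; a one-line sketch (names abbreviated, and the full policy lives in GetPassStateImpl):

```cpp
// Illustrative only: graphs with a TPU device now opt into the
// Graph->MLIR->Graph round trip even when the replicated bridge does not
// support them, because some unreplicated TPU graphs depend on the round
// trip itself.
bool TakesMlirRoundTrip(bool supported_by_replicated_bridge,
                        bool has_tpu_device) {
  return supported_by_replicated_bridge || has_tpu_device;
}
```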
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 6adab4c6c7f6b4..61a329aa46f1ba 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -18,9 +18,15 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
        "@com_google_absl//absl/strings",
+        "@local_xla//xla:shape_util",
         "@local_xla//xla:xla_data_proto_cc",
+        "@local_xla//xla/service:shape_inference",
     ],
     alwayslink = 1,
 )
@@ -47,6 +53,8 @@ tf_custom_op_library(
     ],
     deps = [
         "@com_google_absl//absl/algorithm:container",
+        "@local_xla//xla:shape_util",
         "@local_xla//xla:xla_data_proto_cc",
+        "@local_xla//xla/service:shape_inference",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index 27a534296921cd..a51ad205015bad 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -13,18 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include
+#include
+#include
+#include
 #include
 
 #include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_split.h"
+#include "absl/strings/str_join.h"
+#include "xla/service/shape_inference.h"
+#include "xla/shape.h"
 #include "xla/xla_data.pb.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/types.h"
 
 // Note: Most of the operators defined in this module are used by the jax2tf
 // converter (see go/jax2tf for details) and are used in SavedModel produced
@@ -1123,6 +1136,26 @@ REGISTER_OP("XlaReplicaId")
     })
     .Doc("Replica ID.");
 
+xla::Shape GetShape(shape_inference::ShapeHandle shape_handle,
+                    shape_inference::InferenceContext* c) {
+  if (!c->RankKnown(shape_handle)) {
+    return xla::Shape();
+  }
+  std::vector<int64_t> dims;
+  std::vector<bool> dynamic_dims;
+  for (int i = 0, rank = c->Rank(shape_handle); i < rank; ++i) {
+    bool is_dynamic = !c->ValueKnown(c->Dim(shape_handle, i));
+    dynamic_dims.push_back(is_dynamic);
+    dims.push_back(is_dynamic ? xla::Shape::kUnboundedSize
+                              : c->Value(c->Dim(shape_handle, i)));
+  }
+  return xla::Shape(
+      // Type matters only for indices. S64 is the widest possible type.
+      xla::PrimitiveType::S64, dims,
+      absl::InlinedVector(dynamic_dims.begin(), dynamic_dims.end()),
+      /*tuple_shapes=*/{});
+}
+
 REGISTER_OP("XlaGather")
     .Input("operand: T")
     .Input("start_indices: Tindices")
@@ -1132,7 +1165,63 @@ REGISTER_OP("XlaGather")
     .Input("slice_sizes: Tindices")
     .Attr("dimension_numbers: string")
     .Attr("T: {numbertype, bool}")
     .Attr("Tindices: {int32, int64}")
     .Output("output: T")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) -> absl::Status {
+      std::string dimension_numbers;
+      TF_RETURN_IF_ERROR(c->GetAttr("dimension_numbers", &dimension_numbers));
+      xla::GatherDimensionNumbers gather_dim_numbers;
+      if (!gather_dim_numbers.ParseFromString(dimension_numbers)) {
+        return absl::InvalidArgumentError("Failed to parse dimension_numbers.");
+      }
+      VLOG(3) << c->DebugString();
+      VLOG(3) << "dim_numbers: " << gather_dim_numbers.DebugString();
+      VLOG(3) << "Shapes: operand: " << c->DebugString(c->input(0))
+              << ", start_indices: " << c->DebugString(c->input(1))
+              << ", slice_sizes: " << c->DebugString(c->input(2));
+
+      xla::Shape input_shape = GetShape(c->input(0), c);
+      xla::Shape start_indices_shape = GetShape(c->input(1), c);
+      xla::Shape slice_sizes_shape = GetShape(c->input(2), c);
+
+      const Tensor* slice_sizes_tensor = c->input_tensor(2);
+      if (input_shape == xla::Shape() || input_shape.is_unbounded_dynamic() ||
+          start_indices_shape == xla::Shape() ||
+          slice_sizes_shape == xla::Shape()) {
+        VLOG(3) << "output will be unranked due to unknown or dynamic input "
+                   "shapes.";
+        return shape_inference::UnknownShape(c);
+      }
+      if (slice_sizes_tensor == nullptr ||
+          slice_sizes_tensor->NumElements() == -1) {
+        VLOG(3) << "output will be unranked due to non-constant slice_sizes.";
+        return shape_inference::UnknownShape(c);
+      }
+      std::vector<int64_t> slice_sizes;
+      if (slice_sizes_tensor->dtype() == DT_INT32) {
+        for (int i = 0; i < slice_sizes_tensor->NumElements(); ++i) {
+          slice_sizes.push_back(slice_sizes_tensor->flat<int32>()(i));
+        }
+      } else if (slice_sizes_tensor->dtype() == DT_INT64) {
+        for (int i = 0; i < slice_sizes_tensor->NumElements(); ++i) {
+          slice_sizes.push_back(slice_sizes_tensor->flat<int64_t>()(i));
+        }
+      }
+      VLOG(3) << "slice_sizes [val]: " << absl::StrJoin(slice_sizes, ",");
+      TF_ASSIGN_OR_RETURN(xla::Shape output_shape,
+                          xla::ShapeInference::InferGatherShape(
+                              input_shape, start_indices_shape,
+                              gather_dim_numbers, slice_sizes));
+      std::vector<shape_inference::DimensionHandle> dims;
+      for (int64_t i = 0; i < output_shape.rank(); ++i) {
+        if (output_shape.is_unbounded_dynamic_dimension(i)) {
+          dims.push_back(c->UnknownDim());
+        } else {
+          dims.push_back(c->MakeDim(output_shape.dimensions(i)));
+        }
+      }
+      c->set_output(0, c->MakeShape(dims));
+      VLOG(3) << "output: " << c->DebugString(c->output(0));
+      return absl::OkStatus();
+    })
     .Doc(R"doc(
 Wraps the XLA Gather operator documented at
   https://www.tensorflow.org/xla/operation_semantics#gather
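The new shape function bridges two conventions for "unknown": TF shape inference tracks unknown DimensionHandles, while XLA's gather inferencer expects xla::Shape::kUnboundedSize. A standalone sketch of that round trip, with illustrative integer stand-ins for both markers (not the real sentinel values):

```cpp
#include <cstdint>
#include <vector>

constexpr int64_t kUnknown = -1;    // stand-in for an unknown TF dimension
constexpr int64_t kUnbounded = -2;  // stand-in for xla::Shape::kUnboundedSize

// TF -> XLA: unknown dims become unbounded-dynamic before InferGatherShape.
std::vector<int64_t> TfToXlaDims(const std::vector<int64_t>& tf_dims) {
  std::vector<int64_t> out;
  for (int64_t d : tf_dims) out.push_back(d == kUnknown ? kUnbounded : d);
  return out;
}

// XLA -> TF: unbounded dims in the inferred output become unknown dims again.
std::vector<int64_t> XlaToTfDims(const std::vector<int64_t>& xla_dims) {
  std::vector<int64_t> out;
  for (int64_t d : xla_dims) out.push_back(d == kUnbounded ? kUnknown : d);
  return out;
}
```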
-// Note: this is an internal type, to be used exclusively in this file.
-struct MemrefHolder {
-  MemrefHolder(const XlaCompiledCpuFunction::ShapeInfo& shape_info,
-               void* data_ptr)
-      : rank(shape_info.num_dimensions), data(data_ptr), offset(0) {
-    sizes.resize(shape_info.num_dimensions);
-    strides.resize(shape_info.num_dimensions);
-    int64_t multiplier = 1;
-    for (int i = shape_info.num_dimensions - 1; i >= 0; --i) {
-      int64_t size = shape_info.dimensions[i];
-      sizes[i] = size;
-      strides[i] = multiplier;
-      multiplier *= size;
-    }
-  }
-
-  unsigned rank = 0;
-  // Note: dtype is not needed here.
-  void* data = nullptr;
-  int64_t offset = 0;
-  std::vector<int64_t> sizes;
-  std::vector<int64_t> strides;
-};
-
-}  // namespace
-
 XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
                                                AllocMode alloc_mode)
     : raw_function_(static_data.raw_function_),
-      external_run_function_(static_data.external_run_function_),
-      cpu_executable_(static_data.cpu_executable_),
       result_index_(static_data.result_index_),
       buffer_table_(new void*[static_data.num_buffers_]),
       buffer_infos_(static_data.buffer_infos_),
@@ -73,8 +41,7 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       variable_names_(static_data.variable_names_),
       result_names_(static_data.result_names_),
       program_shape_(static_data.program_shape_),
-      hlo_profile_printer_data_(static_data.hlo_profile_printer_data_),
-      use_xla_runtime_(static_data.use_xla_runtime_) {
+      hlo_profile_printer_data_(static_data.hlo_profile_printer_data_) {
   bool allocate_entry_params =
       alloc_mode == AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS;
   // Allocate arg and temp buffers.
@@ -92,94 +59,13 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
   }
 }
 
-bool XlaCompiledCpuFunction::RunXlaRuntime() {
-  size_t num_memref_args = num_args_ + num_results_;
-  std::vector<MemrefHolder> memref_args;
-  memref_args.reserve(num_memref_args);
-
-  size_t num_ptrs = 1;  // execution context.
-
-  // Append arguments.
-  for (int i = 0; i < num_args_; ++i) {
-    const ShapeInfo& shape_info = arg_shape_infos_[i];
-    memref_args.emplace_back(shape_info, buffer_table_[arg_index_table_[i]]);
-    num_ptrs += 3 + 2 * shape_info.num_dimensions;
-  }
-
-  // Append results.
-  for (int i = 0; i < num_results_; ++i) {
-    const ShapeInfo& shape_info = result_shape_infos_[i];
-    memref_args.emplace_back(shape_info, buffer_table_[result_index_table_[i]]);
-    num_ptrs += 3 + 2 * shape_info.num_dimensions;
-
-    // Point to this result from the "result" entry in the buffer table.
-    void** results = static_cast<void**>(buffer_table_[result_index_]);
-    results[i] = buffer_table_[result_index_table_[i]];
-  }
-
-  std::vector<void*> call_frame;
-  call_frame.resize(num_ptrs);
-  size_t ptr_index = 1;
-  for (const MemrefHolder& memref : memref_args) {
-    auto cast = [](const void* p) { return const_cast<void*>(p); };
-    call_frame[ptr_index + 0] = cast(&memref.data);  // memref.basePtr
-    call_frame[ptr_index + 1] = cast(&memref.data);  // memref.data
-    call_frame[ptr_index + 2] = cast(&memref.offset);
-    unsigned rank = memref.rank;
-    for (int64_t d = 0; d < rank; ++d) {
-      call_frame[ptr_index + 3 + d] = cast(&memref.sizes[d]);
-      call_frame[ptr_index + 3 + d + rank] = cast(&memref.strides[d]);
-    }
-    ptr_index += 3 + 2 * rank;
-  }
-
-  assert(num_ptrs == ptr_index);
-
-  xla::runtime::aot::ExecutionContext execution_context;
-  execution_context.custom_call_data = &run_options_;
-  xla::runtime::aot::ExecutionContext* execution_context_ptr =
-      &execution_context;
-  call_frame[0] = &execution_context_ptr;
-
-  auto xla_runtime_func =
-      reinterpret_cast<XlaRuntimeRawFunction>(raw_function_);
-  xla_runtime_func(call_frame.data());
-  if (execution_context.error) {
-    // No error support in XLA; dump error message to stderr.
-    std::cerr << "XLA AOT error: " << execution_context.error << ".\n";
-    return false;
-  }
-  return true;
-}
-
 bool XlaCompiledCpuFunction::Run() {
-  if (use_xla_runtime_) {
-    return RunXlaRuntime();
-  }
-  if (external_run_function_) {
-    std::vector<xla::cpu::BufferDesc> descriptor_table =
-        MakeXlaRuntimeDescriptorTable();
-    return external_run_function_(cpu_executable_, descriptor_table,
-                                  &run_options_);
-  }
   XlaCustomCallStatus status;
   raw_function_(buffer_table_[result_index_], &run_options_, nullptr,
                 buffer_table_, &status, profile_counters_);
   return !xla::CustomCallStatusGetMessage(&status).has_value();
 }
 
-std::vector<xla::cpu::BufferDesc>
-XlaCompiledCpuFunction::MakeXlaRuntimeDescriptorTable() {
-  std::vector<xla::cpu::BufferDesc> descriptor_table;
-  descriptor_table.reserve(num_buffers_);
-  for (int32_t i = 0; i < num_buffers_; ++i) {
-    void* data = buffer_table_[i];
-    uint64_t size = buffer_infos_[i].size();
-    descriptor_table.emplace_back(data, size);
-  }
-  return descriptor_table;
-}
-
 XlaCompiledCpuFunction::~XlaCompiledCpuFunction() {
   xla::cpu_function_runtime::FreeContiguous(alloc_buffer_table_);
   delete[] buffer_table_;
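With the XLA Runtime branches deleted, Run() reduces to a single call through the AOT entry point. A self-contained sketch of that remaining path (the signature is simplified; run options, custom-call status, and profile counters are member state in the real class and are stubbed with nullptr here):

```cpp
#include <cstddef>
#include <cstdint>

// Simplified stand-in for XlaCompiledCpuFunction::RawFunction; the real
// signature takes xla::ExecutableRunOptions* and XlaCustomCallStatus*.
using RawFunction = void (*)(void* result, const void* run_options,
                             const void** args, void** temps, void* status,
                             int64_t* profile_counters);

bool RunAot(RawFunction fn, void** buffer_table, std::size_t result_index) {
  // run_options, status, and the profile counters are stubbed out to keep
  // the sketch self-contained; the real Run() passes member state.
  fn(buffer_table[result_index], /*run_options=*/nullptr, /*args=*/nullptr,
     buffer_table, /*status=*/nullptr, /*profile_counters=*/nullptr);
  return true;  // The real Run() checks XlaCustomCallStatus for a message.
}
```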
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index d03f06e14f5bce..db280e239f0441 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -61,15 +61,6 @@ class XlaCompiledCpuFunction {
                                const void** args, void** temps,
                                XlaCustomCallStatus*, int64_t* profile_counters);
 
-  // Signature of the XLA Runtime raw function. Used only by XLA Runtime AOT.
-  using XlaRuntimeRawFunction = void (*)(void**);
-
-  // Signature of an external run function. Used only by XLA Runtime JIT.
-  using ExternalRunFunction =
-      bool (*)(const xla::cpu::CpuExecutable* cpu_executable,
-               const std::vector<xla::cpu::BufferDesc>& descriptor_table,
-               const xla::ExecutableRunOptions* run_options);
-
   // Simple struct to describe a tensor's shape.
   // Note: this is a poor man's substitute for xla::ShapeProto, but we cannot
   // depend on protobuf's in this library.
@@ -90,9 +81,6 @@ class XlaCompiledCpuFunction {
     // The raw function to call.
     RawFunction raw_function_;
 
-    ExternalRunFunction external_run_function_ = nullptr;
-    const xla::cpu::CpuExecutable* cpu_executable_ = nullptr;
-
     // Contains information about the buffers used by the XLA computation.
     const xla::cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr;
     int32_t num_buffers_ = 0;
@@ -139,8 +127,6 @@ class XlaCompiledCpuFunction {
     // declared so we don't have access to that information here.
     int64_t profile_counters_size_ = 0;
 
-    bool use_xla_runtime_ = false;
-
     // Only XlaCompiledCpuFunction is allowed to read and write the above
    // fields.
     friend class XlaCompiledCpuFunction;
@@ -164,6 +150,8 @@ class XlaCompiledCpuFunction {
 
   XlaCompiledCpuFunction(const XlaCompiledCpuFunction&) = delete;
   XlaCompiledCpuFunction& operator=(const XlaCompiledCpuFunction&) = delete;
+  XlaCompiledCpuFunction(XlaCompiledCpuFunction&&) = default;
+  XlaCompiledCpuFunction& operator=(XlaCompiledCpuFunction&&) = default;
 
   // Sets the intra-op thread pool used to run individual ops concurrently.
   void set_thread_pool(const Eigen::ThreadPoolDevice* pool) {
@@ -331,16 +319,6 @@ class XlaCompiledCpuFunction {
     static_data->raw_function_ = raw_function;
   }
 
-  static void set_static_data_external_run_function(
-      StaticData* static_data, ExternalRunFunction external_run_function) {
-    static_data->external_run_function_ = external_run_function;
-  }
-
-  static void set_static_data_cpu_executable(
-      StaticData* static_data, const xla::cpu::CpuExecutable* cpu_executable) {
-    static_data->cpu_executable_ = cpu_executable;
-  }
-
   static void set_static_data_buffer_infos(
       StaticData* static_data,
       const xla::cpu_function_runtime::BufferInfo* buffer_infos) {
@@ -428,19 +406,13 @@ class XlaCompiledCpuFunction {
     static_data->profile_counters_size_ = profile_counters_size;
  }
 
-  static void set_static_data_use_xla_runtime(StaticData* static_data,
-                                              bool use_xla_runtime) {
-    static_data->use_xla_runtime_ = use_xla_runtime;
-  }
+  // TODO(ezhulenev): This is a no-op now that XLA Runtime is removed, but it
+  // is still required for building some targets. Figure out why and delete!
+  static void set_static_data_use_xla_runtime(StaticData* static_data, bool) {}
 
  private:
   const RawFunction raw_function_;
 
-  // [Optional] External Run() function.
-  const ExternalRunFunction external_run_function_;
-
-  // [Maybe Optional] CpuExecutable to be passed to external_run_function_.
-  const xla::cpu::CpuExecutable* cpu_executable_;
-
   const size_t result_index_;
 
   // Array containing pointers to argument and temp buffers (slots corresponding
@@ -488,13 +460,6 @@ class XlaCompiledCpuFunction {
   const xla::ProgramShapeProto* program_shape_ = nullptr;
   const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
 
-  const bool use_xla_runtime_ = false;
-
-  // Creates a descriptor table for XLA Runtime.
-  std::vector<xla::cpu::BufferDesc> MakeXlaRuntimeDescriptorTable();
-
-  bool RunXlaRuntime();
-
   // Add `XlaJitCompiledCpuFunction` as a friend so that it can access the
   // `set_static_data_*` static methods above.
   friend class XlaJitCompiledCpuFunction;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 8af2c21994d4c4..b684d9b9df08ef 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/shape_inference.h"
 #include "tensorflow/compiler/jit/xla_compile_util.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h"
 #include "tensorflow/compiler/mlir/utils/array_container_utils.h"
 #include "tensorflow/compiler/tf2xla/graph_compiler.h"
@@ -78,11 +79,18 @@ limitations under the License.
 #include "tensorflow/core/tpu/tpu_defs.h"
 #include "tensorflow/core/util/debug_data_dumper.h"
 #include "tensorflow/core/util/dump_graph.h"
+#include "tsl/platform/errors.h"
 #include "tsl/platform/tensor_float_32_utils.h"
 
 namespace tensorflow {
 namespace {
 
+// Name of component for error logging. This name is fixed and required to
+// enable logging.
+constexpr char kSingleOpComponent[] = "TF2XLA_XLA_COMPILER_COMPILE_SINGLE_OP";
+constexpr char kCompileFunctionComponent[] =
+    "TF2XLA_XLA_COMPILER_COMPILE_FUNCTION";
+
 // Checks that arguments `args` match types `types`.
 Status CheckSignature(const DataTypeVector& types,
                       absl::Span<const XlaCompiler::Argument> args) {
@@ -769,6 +777,9 @@ Status XlaCompiler::CompileSingleOp(
     tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
         tensorflow::metrics::Phase2XlaCompilerMetric::
             kCompileSingleOpXlaBuilderFailure);
+    tsl::error_logging::Log(mlir::TF::kBridgeComponent, kSingleOpComponent,
+                            status.ToString())
+        .IgnoreError();
   }
   return status;
 }
@@ -778,7 +789,7 @@ Status XlaCompiler::CompileFunction(
     const NameAttrList& fn_name_attrs,
     absl::Span<const XlaCompiler::Argument> args,
     XlaCompiler::CompilationResult* result) {
-  const string function_id =
+  string function_id =
       Canonicalize(fn_name_attrs.name(), AttrSlice(&fn_name_attrs.attr()));
   VLOG(1) << "XlaCompiler::CompileFunction " << function_id;
@@ -861,49 +872,25 @@ Status XlaCompiler::CompileFunction(
 
   VLOG(1) << "====================================================";
 
-  auto state = ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_DISABLED;
-  if (options.is_entry_computation) {
-    state = GetMlirBridgeRolloutState(config_proto);
-  }
-
-  if (state == ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED) {
-    GraphDebugInfo debug_info;
-    VLOG(1) << "Using the MLIR bridge to compile the function.";
-    std::vector<std::string> valid_control_rets =
-        GetValidControlRets(fbody->control_ret_nodes, *graph);
-    auto mlir_result = CompileGraphToXlaHlo(
-        std::move(*graph), mlir::SpanToArrayRef<XlaCompiler::Argument>(args),
-        valid_control_rets, options_.device_type.type_string(),
-        options.use_tuple_arg, /*analyse_graph=*/false, *options_.flib_def,
-        debug_info, options_.shape_determination_fns, result);
-    if (mlir_result.ok()) {
-      tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
-          tensorflow::metrics::Phase2XlaCompilerMetric::
-              kCompileFunctionMlirSuccess);
-      VLOG(1) << "MLIR bridge was successfull";
-    } else {
-      tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
-          tensorflow::metrics::Phase2XlaCompilerMetric::
-              kCompileFunctionMlirFailure);
-      VLOG(1) << "MLIR failed, no fallback";
-      return mlir_result;
-    }
-  } else {
-    VLOG(1) << "MLIR bridge off. Using the old bridge to compile the function";
-    auto status =
-        CompileGraph(options, function_id, std::move(graph), args, result);
-    if (!status.ok()) {
-      tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
-          tensorflow::metrics::Phase2XlaCompilerMetric::
-              kCompileFunctionXlaBuilderFailure);
-      ::tsl::errors::AppendToMessage(
-          &status, "tf2xla conversion failed while converting ", function_id,
-          ". Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and "
-          "--vmodule=xla_compiler=2 to obtain a dump of the compiled "
-          "functions.");
-      return status;
-    }
+  VLOG(1) << "CompileFunction with XlaBuilder";
+  auto status =
+      CompileGraph(options, function_id, std::move(graph), args, result);
+  if (!status.ok()) {
+    tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
+        tensorflow::metrics::Phase2XlaCompilerMetric::
+            kCompileFunctionXlaBuilderFailure);
+    ::tsl::errors::AppendToMessage(
+        &status, "tf2xla conversion failed while converting ",
+        std::move(function_id),
+        ". Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and "
+        "--vmodule=xla_compiler=2 to obtain a dump of the compiled "
+        "functions.");
+    tsl::error_logging::Log(mlir::TF::kBridgeComponent,
+                            kCompileFunctionComponent, status.ToString())
+        .IgnoreError();
+    return status;
   }
+
   tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
       tensorflow::metrics::Phase2XlaCompilerMetric::
           kCompileFunctionXlaBuilderSuccess);
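Both compile paths now follow the same failure pattern: augment the Status with debugging hints, emit it through the component-scoped error log, and return it unchanged. A rough standalone sketch of the pattern (LogToComponent and the local Status struct are stand-ins for tsl::error_logging::Log and tsl::Status, whose real uses appear in the diff above):

```cpp
#include <iostream>
#include <string>

struct Status {  // minimal stand-in for tsl::Status
  bool ok() const { return message.empty(); }
  std::string message;
};

// Stand-in for tsl::error_logging::Log(component, subcomponent, message).
void LogToComponent(const std::string& component,
                    const std::string& subcomponent, const std::string& msg) {
  std::cerr << "[" << component << "/" << subcomponent << "] " << msg << "\n";
}

Status CompileWithLogging(Status status) {
  if (!status.ok()) {
    status.message += " (run with --vmodule=xla_compiler=2 for a dump)";
    LogToComponent("TF2XLA", "XLA_COMPILER_COMPILE_FUNCTION", status.message);
  }
  return status;  // Logging never masks the original error.
}
```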
#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/jit/xla_compile_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h" #include "tensorflow/compiler/mlir/utils/array_container_utils.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" @@ -78,11 +79,18 @@ limitations under the License. #include "tensorflow/core/tpu/tpu_defs.h" #include "tensorflow/core/util/debug_data_dumper.h" #include "tensorflow/core/util/dump_graph.h" +#include "tsl/platform/errors.h" #include "tsl/platform/tensor_float_32_utils.h" namespace tensorflow { namespace { +// Name of component for error logging. This name is fixed and required to +// enable logging. +constexpr char kSingleOpComponent[] = "TF2XLA_XLA_COMPILER_COMPILE_SINGLE_OP"; +constexpr char kCompileFunctionComponent[] = + "TF2XLA_XLA_COMPILER_COMPILE_FUNCTION"; + // Checks that arguments `args` match types `types`. Status CheckSignature(const DataTypeVector& types, absl::Span args) { @@ -769,6 +777,9 @@ Status XlaCompiler::CompileSingleOp( tensorflow::metrics::IncrementPhase2XlaCompilerCounter( tensorflow::metrics::Phase2XlaCompilerMetric:: kCompileSingleOpXlaBuilderFailure); + tsl::error_logging::Log(mlir::TF::kBridgeComponent, kSingleOpComponent, + status.ToString()) + .IgnoreError(); } return status; } @@ -778,7 +789,7 @@ Status XlaCompiler::CompileFunction( const NameAttrList& fn_name_attrs, absl::Span args, XlaCompiler::CompilationResult* result) { - const string function_id = + string function_id = Canonicalize(fn_name_attrs.name(), AttrSlice(&fn_name_attrs.attr())); VLOG(1) << "XlaCompiler::CompileFunction " << function_id; @@ -861,49 +872,25 @@ Status XlaCompiler::CompileFunction( VLOG(1) << "===================================================="; - auto state = ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_DISABLED; - if (options.is_entry_computation) { - state = GetMlirBridgeRolloutState(config_proto); - } - - if (state == ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED) { - GraphDebugInfo debug_info; - VLOG(1) << "Using the MLIR bridge to compile the function."; - std::vector valid_control_rets = - GetValidControlRets(fbody->control_ret_nodes, *graph); - auto mlir_result = CompileGraphToXlaHlo( - std::move(*graph), mlir::SpanToArrayRef(args), - valid_control_rets, options_.device_type.type_string(), - options.use_tuple_arg, /*analyse_graph=*/false, *options_.flib_def, - debug_info, options_.shape_determination_fns, result); - if (mlir_result.ok()) { - tensorflow::metrics::IncrementPhase2XlaCompilerCounter( - tensorflow::metrics::Phase2XlaCompilerMetric:: - kCompileFunctionMlirSuccess); - VLOG(1) << "MLIR bridge was successfull"; - } else { - tensorflow::metrics::IncrementPhase2XlaCompilerCounter( - tensorflow::metrics::Phase2XlaCompilerMetric:: - kCompileFunctionMlirFailure); - VLOG(1) << "MLIR failed, no fallback"; - return mlir_result; - } - } else { - VLOG(1) << "MLIR bridge off. Using the old bridge to compile the function"; - auto status = - CompileGraph(options, function_id, std::move(graph), args, result); - if (!status.ok()) { - tensorflow::metrics::IncrementPhase2XlaCompilerCounter( - tensorflow::metrics::Phase2XlaCompilerMetric:: - kCompileFunctionXlaBuilderFailure); - ::tsl::errors::AppendToMessage( - &status, "tf2xla conversion failed while converting ", function_id, - ". 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 29e0de5edafbc2..e2adb15245c183 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1266,7 +1266,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@eigen_archive//:eigen3",
         "@ml_dtypes//:float8",
-        "@ml_dtypes//:int4",
+        "@ml_dtypes//:intn",
     ] + if_static([":lib_internal_impl"]),
 )
@@ -1294,7 +1294,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@eigen_archive//:eigen3",
         "@ml_dtypes//:float8",
-        "@ml_dtypes//:int4",
+        "@ml_dtypes//:intn",
     ],
 )
@@ -1443,7 +1443,7 @@ cc_library(
         "@eigen_archive//:eigen3",
         "@local_tsl//tsl/lib/math:math_util",
         "@ml_dtypes//:float8",
-        "@ml_dtypes//:int4",
+        "@ml_dtypes//:intn",
         "@snappy",
         "@zlib",
     ] + select({
diff --git a/tensorflow/core/api_def/base_api/api_def_ConvertToListOfSparseCoreCooTensors.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConvertToListOfSparseCoreCooTensors.pbtxt
new file mode 100644
index 00000000000000..13f09747d4025a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConvertToListOfSparseCoreCooTensors.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConvertToListOfSparseCoreCooTensors"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConvertToSparseCoreCsrWrappedCooTensor.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConvertToSparseCoreCsrWrappedCooTensor.pbtxt
new file mode 100644
index 00000000000000..8676be4f4f6c2f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConvertToSparseCoreCsrWrappedCooTensor.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConvertToSparseCoreCsrWrappedCooTensor"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
index 7f2a8a1cf1ab33..da21e0f6981c7e 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
@@ -55,6 +55,10 @@ Note that on CPU, if an out of bound index is found, an error is returned.
 On GPU, if an out of bound index is found, a 0 is stored in the
 corresponding output value.
 
+Note that on TPU, if any dimension of `params` has size 0, the output is a
+tensor of the expected shape filled with zeros. On CPU and GPU an error is
+returned instead.
+
 See also `tf.batch_gather` and `tf.gather_nd`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_GetStatsFromListOfSparseCoreCooTensors.pbtxt b/tensorflow/core/api_def/base_api/api_def_GetStatsFromListOfSparseCoreCooTensors.pbtxt
new file mode 100644
index 00000000000000..e0976255bd776e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GetStatsFromListOfSparseCoreCooTensors.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GetStatsFromListOfSparseCoreCooTensors"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GetTpuTaskId.pbtxt b/tensorflow/core/api_def/base_api/api_def_GetTpuTaskId.pbtxt
new file mode 100644
index 00000000000000..62e52142d9af92
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GetTpuTaskId.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "GetTpuTaskId"
+  visibility: HIDDEN
+  out_arg {
+    name: "tpu_task_id"
+    description: <